Coverage for sources/librovore/structures/mkdocs/conversion.py: 0%

191 statements  

« prev     ^ index     » next       coverage.py v7.10.4, created at 2025-08-17 23:43 +0000

1# vim: set filetype=python fileencoding=utf-8: 

2# -*- coding: utf-8 -*- 

3 

4#============================================================================# 

5# # 

6# Licensed under the Apache License, Version 2.0 (the "License"); # 

7# you may not use this file except in compliance with the License. # 

8# You may obtain a copy of the License at # 

9# # 

10# http://www.apache.org/licenses/LICENSE-2.0 # 

11# # 

12# Unless required by applicable law or agreed to in writing, software # 

13# distributed under the License is distributed on an "AS IS" BASIS, # 

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 

15# See the License for the specific language governing permissions and # 

16# limitations under the License. # 

17# # 

18#============================================================================# 

19 

20 

21''' HTML to markdown conversion for MkDocs content. ''' 

22 

23 

24from bs4 import BeautifulSoup as _BeautifulSoup 

25 

26from . import __ 

27 

28 

def html_to_markdown( html_text: str ) -> str:
    ''' Converts MkDocs HTML content to clean markdown format.

        Blank (empty or whitespace-only) input yields an empty string.
        If the HTML parser raises, the input text is returned unchanged.
    '''
    if html_text.strip( ) == '':
        return ''
    try:
        document = _BeautifulSoup( html_text, 'lxml' )
    except Exception:
        # Best effort: hand the raw input back when parsing fails.
        return html_text
    markdown = _convert_element_to_markdown( document, _MarkdownContext( ) )
    return _clean_whitespace( markdown )

37 

38 

class _MarkdownContext:
    ''' Mutable state carried through one HTML-to-Markdown conversion. '''

    def __init__( self ) -> None:
        # Label of the admonition being processed; empty by default.
        self.admonition_type: str = ''
        # Flag for being inside an admonition; off by default.
        self.in_admonition: bool = False

45 

46 

def _convert_admonition(
    element: __.typx.Any, context: _MarkdownContext
) -> str:
    ''' Converts Material for MkDocs admonition to clean text.

        The result is a single ``**Title**: body`` paragraph. The title
        comes from the first element carrying the ``admonition-title``
        class, or else from the admonition kind found in the element's
        classes (``Note`` when none is recognized). An admonition with
        no body text yields an empty string.
    '''
    class_names = element.get( 'class', [ ] )
    if isinstance( class_names, str ):
        class_names = class_names.split( )
    known_kinds = ( 'note', 'info', 'warning', 'danger', 'tip' )
    # First recognized class wins; fall back to a generic label.
    kind = next(
        ( name.capitalize( ) for name in class_names
          if name in known_kinds ),
        'Note' )
    # Remember the surrounding state so that it can be put back below.
    saved_state = ( context.in_admonition, context.admonition_type )
    context.in_admonition = True
    context.admonition_type = kind
    title_node = element.find( class_ = 'admonition-title' )
    if title_node:
        title = title_node.get_text( strip = True )
    else:
        title = kind
    pieces: list[ str ] = [ ]
    for child in element.children:
        is_title = (
            hasattr( child, 'get' )
            and 'admonition-title' in child.get( 'class', [ ] ) )
        # The title is already rendered in bold; do not repeat it.
        if is_title: continue
        text = _convert_element_to_markdown( child, context ).strip( )
        if text: pieces.append( text )
    context.in_admonition, context.admonition_type = saved_state
    body = ' '.join( pieces )
    if not body: return ''
    return f"**{title}**: {body}\n\n"

79 

80 

def _convert_children(
    element: __.typx.Any, context: _MarkdownContext
) -> str:
    ''' Concatenates the markdown of every direct child, in order. '''
    return ''.join(
        _convert_element_to_markdown( node, context )
        for node in element.children )

90 

91 

def _convert_code_block(
    element: __.typx.Any, context: _MarkdownContext
) -> str:
    ''' Converts code block with language detection.

        Text is taken from the first nested ``code`` element, else the
        first nested ``pre`` element, else the element itself, and is
        stripped of surrounding whitespace. Empty code yields an empty
        string; otherwise a fenced block followed by a blank line.
    '''
    language = _detect_code_language( element )
    source_node = element.find( 'code' ) or element.find( 'pre' ) or element
    body = source_node.get_text( ).strip( )
    if not body: return ''
    return (
        f"```{language}\n{body}\n```\n\n" if language
        else f"```\n{body}\n```\n\n" )

104 

105 

def _convert_definition_list(
    element: __.typx.Any, context: _MarkdownContext
) -> str:
    ''' Converts definition list to markdown format.

        The converted children are followed by one newline; a list with
        no visible content yields an empty string.
    '''
    rendered = _convert_children( element, context )
    if not rendered.strip( ): return ''
    return f"{rendered}\n"

112 

113 

def _convert_div( element: __.typx.Any, context: _MarkdownContext ) -> str:
    ''' Converts div elements with special handling for MkDocs patterns.

        Divs classed ``admonition`` go to the admonition converter, divs
        classed ``highlight``, ``codehilite`` or ``superfences`` go to
        the code block converter, and any other div becomes its
        converted children followed by a blank line.
    '''
    class_names = element.get( 'class', [ ] )
    if isinstance( class_names, str ):
        class_names = class_names.split( )
    # Admonitions take precedence over code containers.
    if 'admonition' in class_names:
        return _convert_admonition( element, context )
    code_markers = ( 'highlight', 'codehilite', 'superfences' )
    if any( marker in class_names for marker in code_markers ):
        return _convert_code_block( element, context )
    inner = _convert_children( element, context )
    if not inner.strip( ): return ''
    return f"{inner}\n\n"

127 

128 

def _convert_element_to_markdown(
    element: __.typx.Any, context: _MarkdownContext
) -> str:
    ''' Converts HTML element to markdown using single-pass traversal.

        Nodes with a tag name are dispatched to the tag converter. Text
        nodes are returned verbatim, except for nodes that are markup
        rather than content (comments, doctypes, declarations and
        processing instructions), which produce no output.
    '''
    # Imported at function scope so that this converter stays
    # self-contained; the module already depends on bs4.
    from bs4.element import (
        Comment, Declaration, Doctype, ProcessingInstruction )
    if getattr( element, 'name', None ):
        return _convert_tag_to_markdown( element, context )
    non_content = ( Comment, Declaration, Doctype, ProcessingInstruction )
    # Without this guard, the text of an HTML comment or a doctype
    # would be emitted as if it were document prose.
    if isinstance( element, non_content ): return ''
    return str( element )

136 

137 

def _convert_header( element: __.typx.Any ) -> str:
    ''' Converts header element to markdown.

        The heading level is read from the digit in the tag name and
        becomes that many ``#`` characters. A header without text
        yields an empty string.
    '''
    heading = element.get_text( strip = True )
    if heading:
        hashes = '#' * int( element.name[ 1 ] )
        return f"{hashes} {heading}\n\n"
    return ''

146 

147 

def _convert_inline_code( element: __.typx.Any ) -> str:
    ''' Wraps the text of the element in single backticks. '''
    return f"`{element.get_text( )}`"

152 

153 

def _convert_link( element: __.typx.Any, context: _MarkdownContext ) -> str:
    ''' Converts anchor element to markdown link.

        Anchors without an ``href``, or whose ``href`` starts with
        ``#``, are reduced to their text.
    '''
    target = element.get( 'href', '' )
    label = element.get_text( )
    if not target or target.startswith( '#' ):
        return label
    return f"[{label}]({target})"

161 

162 

def _convert_preformatted(
    element: __.typx.Any, context: _MarkdownContext
) -> str:
    ''' Converts preformatted text block.

        The element text is placed, unstripped, inside a fenced block
        that is tagged with the detected language when there is one.
        Whitespace-only text yields an empty string.
    '''
    language = _detect_code_language( element )
    raw = element.get_text( )
    if raw.strip( ) == '': return ''
    return (
        f"```{language}\n{raw}\n```\n\n" if language
        else f"```\n{raw}\n```\n\n" )

174 

175 

def _convert_span( element: __.typx.Any, context: _MarkdownContext ) -> str:
    ''' Converts span element with special handling for mkdocstrings.

        Spans classed ``doc-heading`` are rendered in bold (or dropped
        when they have no visible content); all other spans are
        replaced by their converted children.
    '''
    class_names = element.get( 'class', [ ] )
    if isinstance( class_names, str ):
        class_names = class_names.split( )
    inner = _convert_children( element, context )
    if 'doc-heading' not in class_names: return inner
    if not inner.strip( ): return ''
    return f"**{inner}**"

185 

186 

def _convert_table( element: __.typx.Any, context: _MarkdownContext ) -> str:
    ''' Converts HTML table to simple text representation.

        Each ``tr`` that has cells becomes one line with the stripped
        cell texts joined by `` | ``. Lines are joined by newlines and
        followed by a blank line; no rows yields an empty string.
    '''
    lines: list[ str ] = [ ]
    for row in element.find_all( 'tr' ):
        texts = [
            cell.get_text( strip = True )
            for cell in row.find_all( [ 'td', 'th' ] ) ]
        if texts: lines.append( ' | '.join( texts ) )
    if not lines: return ''
    return '\n'.join( lines ) + '\n\n'

198 

199 

def _convert_tag_to_markdown( # noqa: C901, PLR0911, PLR0912
    element: __.typx.Any, context: _MarkdownContext
) -> str:
    ''' Converts HTML tag to markdown with MkDocs-specific handling.

        Skippable elements yield an empty string. Tags with a dedicated
        converter are delegated to it. The remaining known tags wrap
        their converted children in a fixed prefix and suffix, or yield
        an empty string when the children have no visible content.
        Unknown tags are replaced by their converted children.
    '''
    if _should_skip_element( element ): return ''
    tag = element.name
    # Tags with a dedicated converter or a fixed rendering.
    if tag == 'code': return _convert_inline_code( element )
    if tag == 'pre': return _convert_preformatted( element, context )
    if tag == 'a': return _convert_link( element, context )
    if tag == 'span': return _convert_span( element, context )
    if tag == 'div': return _convert_div( element, context )
    if tag == 'dl': return _convert_definition_list( element, context )
    if tag in ( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ):
        return _convert_header( element )
    if tag == 'br': return '\n'
    if tag in ( 'table', 'tr', 'td', 'th', 'thead', 'tbody' ):
        return _convert_table( element, context )
    # Everything else is built from the converted children.
    inner = _convert_children( element, context )
    if tag in ( 'strong', 'b', 'dt' ): prefix, suffix = '**', '**'
    elif tag in ( 'em', 'i' ): prefix, suffix = '*', '*'
    elif tag in ( 'p', 'section', 'article' ): prefix, suffix = '', '\n\n'
    elif tag == 'li': prefix, suffix = '- ', '\n'
    elif tag in ( 'ul', 'ol' ): prefix, suffix = '', '\n'
    elif tag == 'dd': prefix, suffix = ': ', '\n'
    else: return inner
    if not inner.strip( ): return ''
    return f"{prefix}{inner}{suffix}"

241 

242 

def _clean_whitespace( text: str ) -> str:
    ''' Cleans up whitespace while preserving markdown structure.

        Outside of fenced code blocks, runs of spaces collapse to one
        space, whitespace at the start and end of each line is removed,
        and three or more consecutive newlines collapse to one blank
        line. Fenced code blocks are kept verbatim, so that indentation
        and alignment inside code survive. The overall result is
        stripped of leading and trailing whitespace.
    '''
    def normalize( prose: str ) -> str:
        prose = __.re.sub( r' +', ' ', prose )
        prose = __.re.sub( r'\n +', '\n', prose )
        prose = __.re.sub( r' +\n', '\n', prose )
        prose = __.re.sub( r'\n{3,}', '\n\n', prose )
        return __.re.sub(
            r'^[ \t]+|[ \t]+$', '', prose, flags = __.re.MULTILINE )

    # The capturing group makes the split keep the fenced blocks:
    # even indices hold prose and odd indices hold fenced blocks.
    segments = __.re.split(
        r'(```[^\n]*\n.*?\n```)', text, flags = __.re.DOTALL )
    cleaned = [
        segment if index % 2 else normalize( segment )
        for index, segment in enumerate( segments ) ]
    return ''.join( cleaned ).strip( )

251 

252 

def _detect_code_language( element: __.typx.Any ) -> str: # noqa: C901, PLR0911
    ''' Detects programming language from code block element.

        The element's own classes are examined first: a class with a
        ``language-``, ``highlight-`` or ``lang-`` prefix yields the
        remainder of the class name, and a bare recognized language
        name yields itself. Failing that, the classes of the first
        nested ``code`` element are examined for the prefixes only.
        An empty string means that no language was found.
    '''
    prefixes = ( 'language-', 'highlight-', 'lang-' )
    bare_names = (
        'python', 'javascript', 'typescript', 'bash', 'shell',
        'json', 'yaml', 'xml', 'html', 'css', 'sql', 'rust',
        'go', 'java', 'cpp', 'c' )

    def as_list( value: __.typx.Any ) -> __.typx.Any:
        return value.split( ) if isinstance( value, str ) else value

    for name in as_list( element.get( 'class', [ ] ) ):
        for prefix in prefixes:
            if name.startswith( prefix ): return name[ len( prefix ): ]
        if name in bare_names: return name
    nested = element.find( 'code' )
    if nested:
        for name in as_list( nested.get( 'class', [ ] ) ):
            for prefix in prefixes:
                if name.startswith( prefix ): return name[ len( prefix ): ]
    return ''

282 

283 

def _should_skip_element( element: __.typx.Any ) -> bool:
    ''' Determines if element should be skipped entirely.

        An element is skipped when any of its classes is one of the
        listed classes or when its ``role`` attribute is one of the
        listed roles.
    '''
    class_names = element.get( 'class', [ ] )
    if isinstance( class_names, str ):
        class_names = class_names.split( )
    skipped_classes = frozenset( (
        'md-nav', 'md-header', 'md-footer', 'md-sidebar',
        'headerlink', 'md-clipboard', 'md-top',
        'toc', 'navigation', 'skip-link' ) )
    skipped_roles = ( 'navigation', 'banner', 'contentinfo' )
    if not skipped_classes.isdisjoint( class_names ): return True
    return element.get( 'role' ) in skipped_roles

297 )