Coverage for sources/librovore/structures/mkdocs/conversion.py: 0%

168 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-09-28 22:09 +0000

1# vim: set filetype=python fileencoding=utf-8: 

2# -*- coding: utf-8 -*- 

3 

4#============================================================================# 

5# # 

6# Licensed under the Apache License, Version 2.0 (the "License"); # 

7# you may not use this file except in compliance with the License. # 

8# You may obtain a copy of the License at # 

9# # 

10# http://www.apache.org/licenses/LICENSE-2.0 # 

11# # 

12# Unless required by applicable law or agreed to in writing, software # 

13# distributed under the License is distributed on an "AS IS" BASIS, # 

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 

15# See the License for the specific language governing permissions and # 

16# limitations under the License. # 

17# # 

18#============================================================================# 

19 

20 

21''' HTML to markdown conversion for MkDocs content. ''' 

22 

23 

24from bs4 import BeautifulSoup as _BeautifulSoup 

25 

26from . import __ 

27from .converters import extract_code_language as _extract_code_language 

28 

29 

def html_to_markdown( html_text: str ) -> str:
    ''' Renders MkDocs HTML as tidy markdown text.

        Input that is empty or only whitespace produces an empty string. If
        building the parse tree raises, the input is handed back unchanged.
    '''
    if html_text.strip( ) == '': return ''
    try:
        document = _BeautifulSoup( html_text, 'lxml' )
    except Exception:
        # Best effort: hand back the raw text rather than fail the caller.
        return html_text
    markdown = _convert_element_to_markdown( document, _MarkdownContext( ) )
    return _clean_whitespace( markdown )

38 

39 

class _MarkdownContext:
    ''' Mutable state carried through one HTML-to-Markdown conversion. '''

    def __init__( self ) -> None:
        # A fresh context starts outside of any admonition.
        self.in_admonition: bool = False
        self.admonition_type: str = ''

46 

47 

def _convert_admonition(
    element: __.typx.Any, context: _MarkdownContext
) -> str:
    ''' Converts Material for MkDocs admonition to clean text.

        Produces ``**Title**: content`` followed by a blank line, or an empty
        string when no child converts to non-blank text. The title comes from
        the first descendant with class ``admonition-title``; when that is
        missing or blank, the capitalized admonition type is used instead
        (``Note`` if none of the known type classes is present).

        While children are converted, ``context`` is marked as being inside
        an admonition of this type. The previous state is restored
        afterwards, including when a child conversion raises.
    '''
    classes = element.get( 'class', [ ] )
    if isinstance( classes, str ):
        classes = classes.split( )
    known_types = ( 'note', 'info', 'warning', 'danger', 'tip' )
    admonition_type = next(
        ( cls.capitalize( ) for cls in classes if cls in known_types ),
        'Note' )
    title_elem = element.find( class_ = 'admonition-title' )
    title = ''
    if title_elem is not None:
        # Collapse whitespace instead of ``get_text( strip = True )``, which
        # joins stripped fragments with no separator and thereby glues
        # together words that are separated by inline tags.
        title = ' '.join( title_elem.get_text( ).split( ) )
    if not title: title = admonition_type
    saved_state = ( context.in_admonition, context.admonition_type )
    context.in_admonition = True
    context.admonition_type = admonition_type
    content_parts: list[ str ] = [ ]
    try:
        for child in element.children:
            # The title was already extracted above; do not repeat it.
            if ( hasattr( child, 'get' )
                 and 'admonition-title' in child.get( 'class', [ ] ) ):
                continue
            converted = _convert_element_to_markdown( child, context )
            converted = converted.strip( )
            if converted: content_parts.append( converted )
    finally:
        context.in_admonition, context.admonition_type = saved_state
    content = ' '.join( content_parts )
    return f"**{title}**: {content}\n\n" if content else ''

80 

81 

def _convert_children(
    element: __.typx.Any, context: _MarkdownContext
) -> str:
    ''' Renders each direct child in order and concatenates the results. '''
    return ''.join(
        _convert_element_to_markdown( node, context )
        for node in element.children )

91 

92 

def _convert_code_block(
    element: __.typx.Any, context: _MarkdownContext
) -> str:
    ''' Renders a code container as a fenced markdown code block.

        Text is read from the first nested ``code`` element, else from the
        first nested ``pre`` element, else from the container itself. An
        empty string is returned when the stripped text is empty. The fence
        carries the detected language when one is reported.
    '''
    language = _detect_code_language( element )
    source = element.find( 'code' ) or element.find( 'pre' ) or element
    body = source.get_text( ).strip( )
    if not body: return ''
    opening = f"```{language}" if language else '```'
    return f"{opening}\n{body}\n```\n\n"

105 

106 

def _convert_definition_list(
    element: __.typx.Any, context: _MarkdownContext
) -> str:
    ''' Renders a definition list from its converted children.

        A newline is appended to the converted children. If they amount to
        nothing but whitespace, an empty string is returned instead.
    '''
    rendered = _convert_children( element, context )
    if not rendered.strip( ): return ''
    return f"{rendered}\n"

113 

114 

def _convert_div( element: __.typx.Any, context: _MarkdownContext ) -> str:
    ''' Renders a div, routing on its CSS classes.

        ``admonition`` divs go to the admonition converter. Divs classed
        ``highlight``, ``codehilite`` or ``superfences`` go to the code block
        converter. Any other div becomes its converted children followed by
        a blank line, or an empty string if they are only whitespace.
    '''
    class_names = element.get( 'class', [ ] )
    if isinstance( class_names, str ): class_names = class_names.split( )
    if 'admonition' in class_names:
        return _convert_admonition( element, context )
    code_markers = ( 'highlight', 'codehilite', 'superfences' )
    if any( marker in class_names for marker in code_markers ):
        return _convert_code_block( element, context )
    body = _convert_children( element, context )
    if not body.strip( ): return ''
    return f"{body}\n\n"

128 

129 

def _convert_element_to_markdown(
    element: __.typx.Any, context: _MarkdownContext
) -> str:
    ''' Converts HTML element to markdown using single-pass traversal.

        Nodes with a truthy ``name`` are treated as tags and dispatched to
        the tag converter. Comments, doctypes, declarations and processing
        instructions yield an empty string, because their payload is markup
        rather than document content. Any other node is returned as its
        string value.
    '''
    if hasattr( element, 'name' ) and element.name:
        return _convert_tag_to_markdown( element, context )
    # Imported here so this function carries its own dependency; only the
    # non-tag path pays for the (cached) import lookup.
    from bs4.element import (
        Comment, Declaration, Doctype, ProcessingInstruction )
    markup_only = ( Comment, Declaration, Doctype, ProcessingInstruction )
    # ``str( )`` on these nodes gives their bare payload (comment text,
    # doctype name), which would otherwise appear as visible text.
    if isinstance( element, markup_only ): return ''
    return str( element )

137 

138 

def _convert_header( element: __.typx.Any ) -> str:
    ''' Converts header element to markdown.

        Produces ``#`` repeated per heading level, the heading text on a
        single line, and a trailing blank line. An empty string is returned
        when the heading has no text.

        Text that sits inside a descendant flagged by
        ``_should_skip_element`` (for example a ``headerlink`` permalink
        anchor) is left out. Whitespace is collapsed to single spaces so that
        words separated by inline tags stay separated.
    '''
    fragments: list[ str ] = [ ]
    for fragment in element.strings:
        skipped = False
        # Walk from the text node up to, but not including, the heading.
        for ancestor in fragment.parents:
            if ancestor is element: break
            if _should_skip_element( ancestor ):
                skipped = True
                break
        if not skipped: fragments.append( str( fragment ) )
    # ``get_text( strip = True )`` would join stripped fragments with no
    # separator, turning "Use <code>x</code> now" into "Usexnow".
    text = ' '.join( ''.join( fragments ).split( ) )
    if not text:
        return ''
    # The tag name is ``h1`` .. ``h6``; its second character is the level.
    level = int( element.name[ 1 ] )
    prefix = '#' * level
    return f"{prefix} {text}\n\n"

147 

148 

def _convert_inline_code( element: __.typx.Any ) -> str:
    ''' Wraps the text of the element in single backticks. '''
    return "`{}`".format( element.get_text( ) )

153 

154 

def _convert_link( element: __.typx.Any, context: _MarkdownContext ) -> str:
    ''' Renders an anchor as a markdown link.

        Anchors without an ``href``, or whose ``href`` starts with ``#``,
        are reduced to their text.
    '''
    target = element.get( 'href', '' )
    label = element.get_text( )
    if not target or target.startswith( '#' ): return label
    return f"[{label}]({target})"

162 

163 

def _convert_preformatted(
    element: __.typx.Any, context: _MarkdownContext
) -> str:
    ''' Converts preformatted text block.

        Produces a fenced code block followed by a blank line, tagged with
        the detected language when one is reported. An empty string is
        returned when the element holds only whitespace.

        Leading newlines and trailing whitespace are trimmed so that no blank
        line appears directly inside either fence. Indentation of the first
        line is kept.
    '''
    language = _detect_code_language( element )
    # Only newlines are removed on the left: spaces there may be the
    # indentation of the first code line.
    text = element.get_text( ).lstrip( '\n' ).rstrip( )
    if not text.strip( ):
        return ''
    fence = f"```{language}" if language else '```'
    return f"{fence}\n{text}\n```\n\n"

175 

176 

def _convert_span( element: __.typx.Any, context: _MarkdownContext ) -> str:
    ''' Renders a span from its converted children.

        Spans classed ``doc-heading`` are emphasized in bold, or dropped when
        their children convert to only whitespace. Other spans pass their
        converted children through unchanged.
    '''
    class_names = element.get( 'class', [ ] )
    if isinstance( class_names, str ): class_names = class_names.split( )
    inner = _convert_children( element, context )
    if 'doc-heading' not in class_names: return inner
    if not inner.strip( ): return ''
    return f"**{inner}**"

186 

187 

def _convert_table( element: __.typx.Any, context: _MarkdownContext ) -> str:
    ''' Converts HTML table to simple text representation.

        Each ``tr`` descendant with at least one ``td`` or ``th`` cell
        becomes one line, with cell texts joined by `` | ``. The lines are
        followed by a blank line. An empty string is returned when no such
        row exists.

        Whitespace inside a cell is collapsed to single spaces. This keeps
        words separated by inline tags apart and keeps each row on one line.
    '''
    rows: list[ str ] = [ ]
    for row in element.find_all( 'tr' ):
        # ``get_text( strip = True )`` would join stripped fragments with no
        # separator, turning "Use <code>x</code> now" into "Usexnow".
        cells = [
            ' '.join( cell.get_text( ).split( ) )
            for cell in row.find_all( [ 'td', 'th' ] ) ]
        if cells:
            rows.append( ' | '.join( cells ) )
    return '\n'.join( rows ) + '\n\n' if rows else ''

199 

200 

201def _convert_tag_to_markdown( # noqa: C901, PLR0911, PLR0912 

202 element: __.typx.Any, context: _MarkdownContext 

203) -> str: 

204 ''' Converts HTML tag to markdown with MkDocs-specific handling. ''' 

205 if _should_skip_element( element ): return '' 

206 match element.name: 

207 case 'code': return _convert_inline_code( element ) 

208 case 'pre': return _convert_preformatted( element, context ) 

209 case 'strong' | 'b': 

210 children = _convert_children( element, context ) 

211 return f"**{children}**" if children.strip( ) else '' 

212 case 'em' | 'i': 

213 children = _convert_children( element, context ) 

214 return f"*{children}*" if children.strip( ) else '' 

215 case 'a': return _convert_link( element, context ) 

216 case 'span': return _convert_span( element, context ) 

217 case 'div': return _convert_div( element, context ) 

218 case 'p' | 'section' | 'article': 

219 children = _convert_children( element, context ) 

220 return f"{children}\n\n" if children.strip( ) else '' 

221 case 'li': 

222 children = _convert_children( element, context ) 

223 return f"- {children}\n" if children.strip( ) else '' 

224 case 'ul' | 'ol': 

225 children = _convert_children( element, context ) 

226 return f"{children}\n" if children.strip( ) else '' 

227 case 'dl': 

228 return _convert_definition_list( element, context ) 

229 case 'dt': 

230 children = _convert_children( element, context ) 

231 return f"**{children}**" if children.strip( ) else '' 

232 case 'dd': 

233 children = _convert_children( element, context ) 

234 return f": {children}\n" if children.strip( ) else '' 

235 case 'h1' | 'h2' | 'h3' | 'h4' | 'h5' | 'h6': 

236 return _convert_header( element ) 

237 case 'br': return '\n' 

238 case 'table' | 'tr' | 'td' | 'th' | 'thead' | 'tbody': 

239 return _convert_table( element, context ) 

240 case _: 

241 return _convert_children( element, context ) 

242 

243 

def _clean_whitespace( text: str ) -> str:
    ''' Cleans up whitespace while preserving markdown structure.

        Outside of fenced code blocks, runs of spaces are collapsed to one
        space, whitespace at line edges is removed, and three or more
        consecutive newlines are reduced to one blank line. The result is
        stripped at both ends.

        Fenced code blocks (an opening line of three backticks with an
        optional info string, up to the next line that starts with three
        backticks) are passed through verbatim, so that indentation and blank
        lines inside code survive.
    '''
    fenced_blocks: list[ str ] = [ ]

    def stash( match: __.re.Match[ str ] ) -> str:
        # Swap the block for a token that none of the patterns below touch:
        # they only match spaces, tabs and newlines.
        fenced_blocks.append( match.group( 0 ) )
        return f"\x00{len( fenced_blocks ) - 1}\x00"

    def restore( match: __.re.Match[ str ] ) -> str:
        index = int( match.group( 1 ) )
        # Leave look-alike tokens that were not produced by ``stash`` alone.
        if index < len( fenced_blocks ): return fenced_blocks[ index ]
        return match.group( 0 )

    text = __.re.sub(
        r'```[^\n`]*\n.*?\n```', stash, text, flags = __.re.DOTALL )
    text = __.re.sub( r' +', ' ', text )
    text = __.re.sub( r'\n +', '\n', text )
    text = __.re.sub( r' +\n', '\n', text )
    text = __.re.sub( r'\n{3,}', '\n\n', text )
    text = __.re.sub( r'^[ \t]+|[ \t]+$', '', text, flags = __.re.MULTILINE )
    text = text.strip( )
    return __.re.sub( r'\x00(\d+)\x00', restore, text )

252 

253 

def _detect_code_language( element: __.typx.Any ) -> str:
    ''' Delegates language detection to the shared language extractor. '''
    language = _extract_code_language( element )
    return language

257 

258 

def _should_skip_element( element: __.typx.Any ) -> bool:
    ''' Reports whether an element is to be left out of the output.

        An element is skipped when any of its classes is in the skip list
        below, or when its ``role`` attribute is ``navigation``, ``banner``
        or ``contentinfo``.
    '''
    class_names = element.get( 'class', [ ] )
    if isinstance( class_names, str ): class_names = class_names.split( )
    skipped_classes = frozenset( (
        'md-nav', 'md-header', 'md-footer', 'md-sidebar',
        'headerlink', 'md-clipboard', 'md-top',
        'toc', 'navigation', 'skip-link',
    ) )
    for name in class_names:
        if name in skipped_classes: return True
    skipped_roles = ( 'navigation', 'banner', 'contentinfo' )
    return element.get( 'role' ) in skipped_roles