Coverage for sources/librovore/structures/mkdocs/conversion.py: 0%

1# vim: set filetype=python fileencoding=utf-8:

2# -*- coding: utf-8 -*-

4#============================================================================#

5# #

6# Licensed under the Apache License, Version 2.0 (the "License"); #

7# you may not use this file except in compliance with the License. #

8# You may obtain a copy of the License at #

9# #

10# http://www.apache.org/licenses/LICENSE-2.0 #

11# #

12# Unless required by applicable law or agreed to in writing, software #

13# distributed under the License is distributed on an "AS IS" BASIS, #

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #

15# See the License for the specific language governing permissions and #

16# limitations under the License. #

17# #

18#============================================================================#

21''' HTML to markdown conversion for MkDocs content. '''

24from bs4 import BeautifulSoup as _BeautifulSoup

26from . import __

def html_to_markdown( html_text: str ) -> str:
    ''' Converts MkDocs HTML content to clean markdown format.

        Blank or whitespace-only input yields an empty string. If building
        the parse tree raises, the input is returned unchanged as a
        best-effort fallback.
    '''
    if html_text.strip( ) == '': return ''
    try: document = _BeautifulSoup( html_text, 'lxml' )
    except Exception: return html_text
    markdown = _convert_element_to_markdown( document, _MarkdownContext( ) )
    return _clean_whitespace( markdown )

class _MarkdownContext:
    ''' Mutable state carried through one HTML-to-Markdown conversion. '''

    def __init__( self ) -> None:
        # Start outside any admonition: the flag is off and the admonition
        # label is empty.
        self.in_admonition, self.admonition_type = False, ''

def _convert_admonition(
    element: __.typx.Any, context: _MarkdownContext
) -> str:
    ''' Converts Material for MkDocs admonition to clean text.

        Returns ``**Title**: body`` followed by a blank line, or an empty
        string when the admonition has no non-blank body. The title comes
        from the ``admonition-title`` element when it has text; otherwise
        the capitalized admonition type is used ('Note' when no recognized
        type class is present).

        While the children are converted, the context is marked as being
        inside an admonition; the previous context values are restored
        afterwards, even if a child conversion raises.
    '''
    classes = element.get( 'class', [ ] )
    if isinstance( classes, str ):
        classes = classes.split( )
    known_types = ( 'note', 'info', 'warning', 'danger', 'tip' )
    admonition_type = next(
        ( cls.capitalize( ) for cls in classes if cls in known_types ),
        'Note' )
    title_elem = element.find( class_ = 'admonition-title' )
    # Collapse whitespace on the whole text. get_text( strip = True ) strips
    # every text node separately and joins them with '', which merges words
    # across inline tags ('Use <code>x</code> now' -> 'Usexnow').
    title = ' '.join( title_elem.get_text( ).split( ) ) if title_elem else ''
    if not title:
        title = admonition_type
    saved_state = ( context.in_admonition, context.admonition_type )
    context.in_admonition = True
    context.admonition_type = admonition_type
    content_parts: list[ str ] = [ ]
    try:
        for child in element.children:
            # The title has already been extracted; do not repeat it in
            # the body.
            if ( hasattr( child, 'get' )
                 and 'admonition-title' in child.get( 'class', [ ] ) ):
                continue
            converted = _convert_element_to_markdown(
                child, context ).strip( )
            if converted:
                content_parts.append( converted )
    finally:
        context.in_admonition, context.admonition_type = saved_state
    content = ' '.join( content_parts )
    return f"**{title}**: {content}\n\n" if content else ''

def _convert_children(
    element: __.typx.Any, context: _MarkdownContext
) -> str:
    ''' Converts each direct child in order and concatenates the results. '''
    return ''.join(
        _convert_element_to_markdown( node, context )
        for node in element.children )

def _convert_code_block(
    element: __.typx.Any, context: _MarkdownContext
) -> str:
    ''' Converts code container to fenced block with language detection.

        Text is taken from the first nested ``code`` element, else the first
        nested ``pre`` element, else the container itself, and is stripped
        of surrounding whitespace. Returns an empty string when no text
        remains.
    '''
    source_node = element.find( 'code' ) or element.find( 'pre' ) or element
    body = source_node.get_text( ).strip( )
    if not body: return ''
    # An empty language leaves a bare opening fence.
    language = _detect_code_language( element )
    return f"```{language}\n{body}\n```\n\n"

104

105

def _convert_definition_list(
    element: __.typx.Any, context: _MarkdownContext
) -> str:
    ''' Converts definition list by converting its children.

        Appends one newline to the converted children; returns an empty
        string when they contain only whitespace.
    '''
    rendered = _convert_children( element, context )
    if not rendered.strip( ): return ''
    return rendered + '\n'

112

113

def _convert_div( element: __.typx.Any, context: _MarkdownContext ) -> str:
    ''' Converts div elements with special handling for MkDocs patterns.

        Divs classed ``admonition`` are converted as admonitions; divs
        classed ``highlight``, ``codehilite`` or ``superfences`` are
        converted as code blocks. Any other div becomes its converted
        children followed by a blank line, or an empty string when those
        are blank.
    '''
    class_names = element.get( 'class', [ ] )
    if isinstance( class_names, str ): class_names = class_names.split( )
    if 'admonition' in class_names:
        return _convert_admonition( element, context )
    code_markers = ( 'highlight', 'codehilite', 'superfences' )
    if any( marker in class_names for marker in code_markers ):
        return _convert_code_block( element, context )
    inner = _convert_children( element, context )
    if not inner.strip( ): return ''
    return f"{inner}\n\n"

127

128

def _convert_element_to_markdown(
    element: __.typx.Any, context: _MarkdownContext
) -> str:
    ''' Converts one parse-tree node to markdown.

        Nodes with a non-empty ``name`` are dispatched as tags; anything
        else is returned as its string form.
    '''
    # NOTE(review): every non-tag node is stringified here, which presumably
    # includes comment and doctype nodes as well as plain text -- confirm
    # whether those should be dropped.
    tag_name = getattr( element, 'name', None )
    if not tag_name: return str( element )
    return _convert_tag_to_markdown( element, context )

136

137

def _convert_header( element: __.typx.Any ) -> str:
    ''' Converts header element (``h1``-``h6``) to markdown.

        Returns the heading text prefixed with one ``#`` per level and
        followed by a blank line, or an empty string when the heading has
        no text. Text inside descendants rejected by
        ``_should_skip_element`` (for example ``headerlink`` permalink
        anchors) is left out, and whitespace runs are collapsed to single
        spaces.
    '''
    fragments = [
        str( text_node ) for text_node in element.strings
        if not _is_inside_skipped_element( text_node, element ) ]
    # Join first and normalize afterwards. Stripping each fragment on its
    # own (as get_text( strip = True ) does) merges words across inline
    # tags: 'The <code>foo</code> function' -> 'Thefoofunction'.
    text = ' '.join( ''.join( fragments ).split( ) )
    if not text:
        return ''
    level = int( element.name[ 1 ] )
    return f"{'#' * level} {text}\n\n"


def _is_inside_skipped_element(
    node: __.typx.Any, root: __.typx.Any
) -> bool:
    ''' Checks for a skipped element between node and root (exclusive). '''
    for ancestor in node.parents:
        if ancestor is root: return False
        if _should_skip_element( ancestor ): return True
    return False

146

147

def _convert_inline_code( element: __.typx.Any ) -> str:
    ''' Converts inline code element to a markdown code span.

        Text without backticks is wrapped in single backticks. Text that
        contains backticks is wrapped in a delimiter one backtick longer
        than its longest backtick run, padded with one space on each side,
        so the span cannot be closed early by its own content.
    '''
    text = element.get_text( )
    longest_run = current_run = 0
    for char in text:
        current_run = current_run + 1 if char == '`' else 0
        longest_run = max( longest_run, current_run )
    if not longest_run:
        return f"`{text}`"
    fence = '`' * ( longest_run + 1 )
    # The padding keeps a leading or trailing backtick in the text from
    # fusing with the delimiter.
    return f"{fence} {text} {fence}"

152

153

def _convert_link( element: __.typx.Any, context: _MarkdownContext ) -> str:
    ''' Converts anchor element to markdown link.

        Anchors with no ``href``, or whose ``href`` starts with ``#``, are
        reduced to their text.
    '''
    label = element.get_text( )
    target = element.get( 'href', '' )
    if not target or target.startswith( '#' ): return label
    return f"[{label}]({target})"

161

162

def _convert_preformatted(
    element: __.typx.Any, context: _MarkdownContext
) -> str:
    ''' Converts preformatted text block to a fenced code block.

        Leading blank lines and trailing whitespace are removed so the
        fence does not contain empty lines at its edges; indentation of the
        first remaining line is kept. Returns an empty string when no text
        remains. The fence carries the detected language, if any.
    '''
    # Drop whole blank lines at the start only, so that the first code
    # line keeps its indentation.
    text = __.re.sub(
        r'\A(?:[ \t]*\r?\n)+', '', element.get_text( ) ).rstrip( )
    if not text:
        return ''
    language = _detect_code_language( element )
    # An empty language leaves a bare opening fence.
    return f"```{language}\n{text}\n```\n\n"

174

175

def _convert_span( element: __.typx.Any, context: _MarkdownContext ) -> str:
    ''' Converts span element with special handling for mkdocstrings.

        Spans classed ``doc-heading`` are rendered in bold, or dropped when
        their content is blank. Every other span is replaced by its
        converted children.
    '''
    names = element.get( 'class', [ ] )
    if isinstance( names, str ): names = names.split( )
    inner = _convert_children( element, context )
    if 'doc-heading' not in names: return inner
    return f"**{inner}**" if inner.strip( ) else ''

185

186

def _convert_table( element: __.typx.Any, context: _MarkdownContext ) -> str:
    ''' Converts HTML table to simple text representation.

        Each ``tr`` found under the element becomes one line whose ``td``
        and ``th`` cell texts are joined with `` | ``. Whitespace inside a
        cell is collapsed to single spaces, so a cell never spans several
        lines. Returns the lines followed by a blank line, or an empty
        string when no row has cells.
    '''
    rows: list[ str ] = [ ]
    for row in element.find_all( 'tr' ):
        # Normalize the joined text. get_text( strip = True ) strips each
        # text node separately and joins with '', which merges words across
        # inline tags ('Use <code>x</code> now' -> 'Usexnow').
        cells = [
            ' '.join( cell.get_text( ).split( ) )
            for cell in row.find_all( [ 'td', 'th' ] ) ]
        if cells:
            rows.append( ' | '.join( cells ) )
    if not rows:
        return ''
    return '\n'.join( rows ) + '\n\n'

198

199

200def _convert_tag_to_markdown( # noqa: C901, PLR0911, PLR0912

201 element: __.typx.Any, context: _MarkdownContext

202) -> str:

203 ''' Converts HTML tag to markdown with MkDocs-specific handling. '''

204 if _should_skip_element( element ): return ''

205 match element.name:

206 case 'code': return _convert_inline_code( element )

207 case 'pre': return _convert_preformatted( element, context )

208 case 'strong' | 'b':

209 children = _convert_children( element, context )

210 return f"**{children}**" if children.strip( ) else ''

211 case 'em' | 'i':

212 children = _convert_children( element, context )

213 return f"*{children}*" if children.strip( ) else ''

214 case 'a': return _convert_link( element, context )

215 case 'span': return _convert_span( element, context )

216 case 'div': return _convert_div( element, context )

217 case 'p' | 'section' | 'article':

218 children = _convert_children( element, context )

219 return f"{children}\n\n" if children.strip( ) else ''

220 case 'li':

221 children = _convert_children( element, context )

222 return f"- {children}\n" if children.strip( ) else ''

223 case 'ul' | 'ol':

224 children = _convert_children( element, context )

225 return f"{children}\n" if children.strip( ) else ''

226 case 'dl':

227 return _convert_definition_list( element, context )

228 case 'dt':

229 children = _convert_children( element, context )

230 return f"**{children}**" if children.strip( ) else ''

231 case 'dd':

232 children = _convert_children( element, context )

233 return f": {children}\n" if children.strip( ) else ''

234 case 'h1' | 'h2' | 'h3' | 'h4' | 'h5' | 'h6':

235 return _convert_header( element )

236 case 'br': return '\n'

237 case 'table' | 'tr' | 'td' | 'th' | 'thead' | 'tbody':

238 return _convert_table( element, context )

239 case _:

240 return _convert_children( element, context )

241

242

def _clean_whitespace( text: str ) -> str:
    ''' Cleans up whitespace while preserving markdown structure.

        Outside fenced code blocks, runs of spaces are collapsed, leading
        and trailing whitespace on each line is removed, and three or more
        consecutive newlines are reduced to two. Fenced code blocks (an
        opening line starting with three backticks through a closing line
        of three backticks) are passed through untouched so that code
        indentation survives. The overall result is stripped.

        Fences that do not start at the beginning of a line are not
        recognized and are cleaned like ordinary text.
    '''
    # With a capturing group, re.split returns the fenced blocks at the odd
    # indexes and the surrounding prose at the even ones.
    segments = __.re.split(
        r'(^```[^\n]*\n.*?^```[ \t]*$)', text,
        flags = __.re.MULTILINE | __.re.DOTALL )
    cleaned = [
        segment if index % 2 else _clean_prose_whitespace( segment )
        for index, segment in enumerate( segments ) ]
    return ''.join( cleaned ).strip( )


def _clean_prose_whitespace( text: str ) -> str:
    ''' Normalizes spaces and blank lines in text outside code fences. '''
    text = __.re.sub( r' +', ' ', text )
    text = __.re.sub( r'\n +', '\n', text )
    text = __.re.sub( r' +\n', '\n', text )
    text = __.re.sub( r'\n{3,}', '\n\n', text )
    return __.re.sub(
        r'^[ \t]+|[ \t]+$', '', text, flags = __.re.MULTILINE )

251

252

def _detect_code_language( element: __.typx.Any ) -> str: # noqa: C901, PLR0911
    ''' Detects programming language from code block element.

        The element's own classes are examined first: a class starting with
        ``language-``, ``highlight-`` or ``lang-`` yields the remainder
        after the prefix, and a class that is itself a recognized language
        name yields that name. If nothing matches, the classes of the first
        nested ``code`` element are checked for the same prefixes only.
        Returns an empty string when no language is found.
    '''
    prefixes = ( 'language-', 'highlight-', 'lang-' )
    bare_names = frozenset( (
        'python', 'javascript', 'typescript', 'bash', 'shell',
        'json', 'yaml', 'xml', 'html', 'css', 'sql', 'rust',
        'go', 'java', 'cpp', 'c' ) )

    def class_list( node: __.typx.Any ) -> __.typx.Any:
        values = node.get( 'class', [ ] )
        return values.split( ) if isinstance( values, str ) else values

    for name in class_list( element ):
        for prefix in prefixes:
            if name.startswith( prefix ): return name[ len( prefix ): ]
        if name in bare_names: return name
    nested = element.find( 'code' )
    if nested:
        # Bare language names are only honoured on the outer element.
        for name in class_list( nested ):
            for prefix in prefixes:
                if name.startswith( prefix ): return name[ len( prefix ): ]
    return ''

282

283

def _should_skip_element( element: __.typx.Any ) -> bool:
    ''' Determines if element should be skipped entirely.

        An element is skipped when it carries any of the listed class names
        or when its ``role`` attribute is ``navigation``, ``banner`` or
        ``contentinfo``.
    '''
    class_names = element.get( 'class', [ ] )
    if isinstance( class_names, str ): class_names = class_names.split( )
    skipped_classes = frozenset( (
        'md-nav', 'md-header', 'md-footer', 'md-sidebar',
        'headerlink', 'md-clipboard', 'md-top',
        'toc', 'navigation', 'skip-link' ) )
    if not skipped_classes.isdisjoint( class_names ): return True
    skipped_roles = ( 'navigation', 'banner', 'contentinfo' )
    return element.get( 'role' ) in skipped_roles