Coverage for sources/librovore/structures/mkdocs/conversion.py: 0%

1# vim: set filetype=python fileencoding=utf-8:

2# -*- coding: utf-8 -*-

4#============================================================================#

5# #

6# Licensed under the Apache License, Version 2.0 (the "License"); #

7# you may not use this file except in compliance with the License. #

8# You may obtain a copy of the License at #

9# #

10# http://www.apache.org/licenses/LICENSE-2.0 #

11# #

12# Unless required by applicable law or agreed to in writing, software #

13# distributed under the License is distributed on an "AS IS" BASIS, #

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #

15# See the License for the specific language governing permissions and #

16# limitations under the License. #

17# #

18#============================================================================#

21''' HTML to markdown conversion for MkDocs content. '''

from bs4 import BeautifulSoup as _BeautifulSoup
from bs4 import Comment as _Comment
from bs4 import Declaration as _Declaration
from bs4 import Doctype as _Doctype
from bs4 import ProcessingInstruction as _ProcessingInstruction

from . import __
from .converters import extract_code_language as _extract_code_language

def html_to_markdown( html_text: str ) -> str:
    ''' Renders MkDocs-generated HTML as tidy markdown text.

        Input that is empty or only whitespace yields an empty string.
        If constructing the parser raises, the input is handed back
        unchanged instead of propagating the error.
    '''
    if html_text.strip( ) == '':
        return ''
    try:
        document = _BeautifulSoup( html_text, 'lxml' )
    except Exception:
        # Best-effort fallback: return the raw input when parsing fails.
        return html_text
    markdown = _convert_element_to_markdown( document, _MarkdownContext( ) )
    return _clean_whitespace( markdown )

class _MarkdownContext:
    ''' Mutable state carried through one HTML-to-Markdown conversion. '''

    def __init__( self ) -> None:
        # A fresh context starts outside of any admonition, so no
        # admonition type has been recorded yet.
        self.in_admonition, self.admonition_type = False, ''

def _convert_admonition(
    element: __.typx.Any, context: _MarkdownContext
) -> str:
    ''' Converts Material for MkDocs admonition to clean text.

        Produces ``**Title**: content`` followed by a blank line, or an
        empty string when no child yields any content. The title is the
        whitespace-normalized text of the ``admonition-title`` descendant.
        When that element is missing or empty, the capitalized admonition
        type is used, which defaults to ``Note`` if none of the recognized
        type classes is present.

        The admonition state on ``context`` is set while children are
        converted and is restored afterwards, even if conversion raises.
    '''
    classes = element.get( 'class', [ ] )
    if isinstance( classes, str ):
        classes = classes.split( )
    admonition_type = 'Note'
    for cls in classes:
        if cls in ( 'note', 'info', 'warning', 'danger', 'tip' ):
            admonition_type = cls.capitalize( )
            break
    title_elem = element.find( class_ = 'admonition-title' )
    title = ''
    if title_elem is not None:
        # Normalize whitespace over the whole text instead of stripping
        # every text node, which would fuse words that are separated only
        # by an inline tag boundary.
        title = ' '.join( title_elem.get_text( ).split( ) )
    if not title:
        title = admonition_type
    old_in_admonition = context.in_admonition
    old_admonition_type = context.admonition_type
    context.in_admonition = True
    context.admonition_type = admonition_type
    content_parts: list[ str ] = [ ]
    try:
        for child in element.children:
            if hasattr( child, 'get' ):
                child_classes = child.get( 'class', [ ] )
                if isinstance( child_classes, str ):
                    child_classes = child_classes.split( )
                # The title has already been rendered as the bold prefix.
                if 'admonition-title' in child_classes:
                    continue
            converted = _convert_element_to_markdown( child, context )
            converted = converted.strip( )
            if converted:
                content_parts.append( converted )
    finally:
        context.in_admonition = old_in_admonition
        context.admonition_type = old_admonition_type
    content = ' '.join( content_parts )
    return f"**{title}**: {content}\n\n" if content else ''

def _convert_children(
    element: __.typx.Any, context: _MarkdownContext
) -> str:
    ''' Renders each child of the element and concatenates the results. '''
    return ''.join(
        _convert_element_to_markdown( child, context )
        for child in element.children )

def _convert_code_block(
    element: __.typx.Any, context: _MarkdownContext
) -> str:
    ''' Converts code block with language detection.

        The code is taken from the first ``code`` descendant, else the
        first ``pre`` descendant, else the element itself. Leading blank
        lines and trailing whitespace are removed, but the indentation of
        the first line of code is kept. Returns a fenced block followed by
        a blank line, or an empty string when there is no code.
    '''
    language = _detect_code_language( element )
    code_element = element.find( 'code' ) or element.find( 'pre' )
    code_text = (
        code_element.get_text( ) if code_element else element.get_text( ) )
    code_text = code_text.rstrip( )
    if not code_text: return ''
    # Cut after the last newline of the leading whitespace: this drops
    # leading blank lines without touching the first line's indentation.
    indent_start = len( code_text ) - len( code_text.lstrip( ) )
    code_text = code_text[ code_text.rfind( '\n', 0, indent_start ) + 1 : ]
    if language: return f"```{language}\n{code_text}\n```\n\n"
    return f"```\n{code_text}\n```\n\n"

105

106

def _convert_definition_list(
    element: __.typx.Any, context: _MarkdownContext
) -> str:
    ''' Renders a definition list from its converted children.

        Yields the children's markdown plus one trailing newline, or an
        empty string when the children produce only whitespace.
    '''
    rendered = _convert_children( element, context )
    if not rendered.strip( ):
        return ''
    return rendered + '\n'

113

114

def _convert_div( element: __.typx.Any, context: _MarkdownContext ) -> str:
    ''' Dispatches a div according to its classes.

        Divs classed ``admonition`` are rendered as admonitions. Divs
        classed ``highlight``, ``codehilite`` or ``superfences`` are
        rendered as code blocks. Any other div is rendered from its
        children and terminated with a blank line.
    '''
    class_names = element.get( 'class', [ ] )
    if isinstance( class_names, str ):
        class_names = class_names.split( )
    if 'admonition' in class_names:
        return _convert_admonition( element, context )
    code_markers = ( 'highlight', 'codehilite', 'superfences' )
    if any( marker in class_names for marker in code_markers ):
        return _convert_code_block( element, context )
    body = _convert_children( element, context )
    if not body.strip( ):
        return ''
    return f"{body}\n\n"

128

129

def _convert_element_to_markdown(
    element: __.typx.Any, context: _MarkdownContext
) -> str:
    ''' Converts HTML element to markdown using single-pass traversal.

        Nodes with a tag name are dispatched to the tag converter. Comments,
        doctypes, declarations and processing instructions are markup
        rather than document text, so they yield an empty string. Any other
        node is returned as its string value.
    '''
    if hasattr( element, 'name' ) and element.name:
        return _convert_tag_to_markdown( element, context )
    # These node types are string subclasses, so ``str( )`` would emit their
    # contents (e.g. the comment body or ``html`` for a doctype) as text.
    if isinstance(
        element,
        ( _Comment, _Declaration, _Doctype, _ProcessingInstruction )
    ): return ''
    return str( element )

137

138

def _convert_header( element: __.typx.Any ) -> str:
    ''' Converts header element to markdown.

        The heading level is read from the digit in the tag name. Direct
        children classed ``headerlink`` (the permalink anchor) are left
        out of the heading text, and whitespace is normalized so that the
        heading occupies a single line. Returns an empty string when no
        text remains.
    '''
    pieces: list[ str ] = [ ]
    for child in element.children:
        classes = child.get( 'class', [ ] ) if hasattr( child, 'get' ) else [ ]
        if isinstance( classes, str ):
            classes = classes.split( )
        # Extracting text from the whole header would bypass the element
        # skip logic and pull in the permalink symbol.
        if 'headerlink' in classes:
            continue
        extract = getattr( child, 'get_text', None )
        pieces.append( extract( ) if callable( extract ) else str( child ) )
    # Normalize over the joined text; stripping each text node separately
    # would fuse words separated only by an inline tag boundary.
    text = ' '.join( ''.join( pieces ).split( ) )
    if not text:
        return ''
    level = int( element.name[ 1 ] )
    prefix = '#' * level
    return f"{prefix} {text}\n\n"

147

148

def _convert_inline_code( element: __.typx.Any ) -> str:
    ''' Converts inline code element.

        The element text is wrapped in a backtick delimiter that is one
        backtick longer than the longest backtick run inside the text, so
        that embedded backticks cannot terminate the span early. Text
        without backticks is wrapped in single backticks.
    '''
    text = element.get_text( )
    longest = current = 0
    for character in text:
        current = current + 1 if character == '`' else 0
        longest = max( longest, current )
    fence = '`' * ( longest + 1 )
    # A backtick touching the delimiter would merge with it, so pad with
    # spaces in that case.
    if longest and ( text.startswith( '`' ) or text.endswith( '`' ) ):
        text = f" {text} "
    return f"{fence}{text}{fence}"

153

154

def _convert_link( element: __.typx.Any, context: _MarkdownContext ) -> str:
    ''' Renders an anchor as a markdown link or as plain text.

        Anchors with no ``href``, or whose ``href`` starts with ``#``, are
        reduced to their text. All others become ``[text](href)``.
    '''
    target = element.get( 'href', '' )
    label = element.get_text( )
    if not target or target.startswith( '#' ):
        return label
    return f"[{label}]({target})"

162

163

def _convert_preformatted(
    element: __.typx.Any, context: _MarkdownContext
) -> str:
    ''' Converts preformatted text block.

        Leading blank lines and trailing whitespace are removed, while the
        indentation of the first line is kept, so that the closing fence
        directly follows the last line of content. Returns a fenced block
        followed by a blank line, or an empty string when the element has
        no non-whitespace text.
    '''
    language = _detect_code_language( element )
    text = element.get_text( ).rstrip( )
    if not text:
        return ''
    # Cut after the last newline of the leading whitespace: this drops
    # leading blank lines without touching the first line's indentation.
    indent_start = len( text ) - len( text.lstrip( ) )
    text = text[ text.rfind( '\n', 0, indent_start ) + 1 : ]
    if language:
        return f"```{language}\n{text}\n```\n\n"
    return f"```\n{text}\n```\n\n"

175

176

def _convert_span( element: __.typx.Any, context: _MarkdownContext ) -> str:
    ''' Renders a span, emphasizing ``doc-heading`` spans in bold.

        Spans without the ``doc-heading`` class are rendered as their
        children. A ``doc-heading`` span whose children are blank yields
        an empty string.
    '''
    class_names = element.get( 'class', [ ] )
    if isinstance( class_names, str ):
        class_names = class_names.split( )
    rendered = _convert_children( element, context )
    if 'doc-heading' not in class_names:
        return rendered
    if not rendered.strip( ):
        return ''
    return f"**{rendered}**"

186

187

def _convert_table( element: __.typx.Any, context: _MarkdownContext ) -> str:
    ''' Converts HTML table to simple text representation.

        Each ``tr`` that has ``td`` or ``th`` cells becomes one line with
        the cell texts joined by `` | ``. Cell text is whitespace-normalized
        so that every row stays on a single line. Returns the lines followed
        by a blank line, or an empty string when no row has cells.
    '''
    rows: list[ str ] = [ ]
    for row in element.find_all( 'tr' ):
        # Normalize over the whole cell text; stripping each text node
        # separately would fuse words separated only by an inline tag.
        cells = [
            ' '.join( cell.get_text( ).split( ) )
            for cell in row.find_all( [ 'td', 'th' ] ) ]
        if cells:
            rows.append( ' | '.join( cells ) )
    if not rows:
        return ''
    return '\n'.join( rows ) + '\n\n'

199

200

201def _convert_tag_to_markdown( # noqa: C901, PLR0911, PLR0912

202 element: __.typx.Any, context: _MarkdownContext

203) -> str:

204 ''' Converts HTML tag to markdown with MkDocs-specific handling. '''

205 if _should_skip_element( element ): return ''

206 match element.name:

207 case 'code': return _convert_inline_code( element )

208 case 'pre': return _convert_preformatted( element, context )

209 case 'strong' | 'b':

210 children = _convert_children( element, context )

211 return f"**{children}**" if children.strip( ) else ''

212 case 'em' | 'i':

213 children = _convert_children( element, context )

214 return f"*{children}*" if children.strip( ) else ''

215 case 'a': return _convert_link( element, context )

216 case 'span': return _convert_span( element, context )

217 case 'div': return _convert_div( element, context )

218 case 'p' | 'section' | 'article':

219 children = _convert_children( element, context )

220 return f"{children}\n\n" if children.strip( ) else ''

221 case 'li':

222 children = _convert_children( element, context )

223 return f"- {children}\n" if children.strip( ) else ''

224 case 'ul' | 'ol':

225 children = _convert_children( element, context )

226 return f"{children}\n" if children.strip( ) else ''

227 case 'dl':

228 return _convert_definition_list( element, context )

229 case 'dt':

230 children = _convert_children( element, context )

231 return f"**{children}**" if children.strip( ) else ''

232 case 'dd':

233 children = _convert_children( element, context )

234 return f": {children}\n" if children.strip( ) else ''

235 case 'h1' | 'h2' | 'h3' | 'h4' | 'h5' | 'h6':

236 return _convert_header( element )

237 case 'br': return '\n'

238 case 'table' | 'tr' | 'td' | 'th' | 'thead' | 'tbody':

239 return _convert_table( element, context )

240 case _:

241 return _convert_children( element, context )

242

243

def _clean_whitespace( text: str ) -> str:
    ''' Cleans up whitespace while preserving markdown structure.

        Outside of fenced code blocks, runs of spaces are collapsed to one
        space, leading and trailing spaces and tabs are removed from every
        line, and consecutive blank lines are reduced to a single blank
        line. Lines inside a fenced code block are kept verbatim, because
        indentation and blank lines are significant in code. The result is
        stripped of leading and trailing whitespace.

        A line containing an odd number of triple-backtick markers is
        treated as opening or closing a fence.
    '''
    cleaned: list[ str ] = [ ]
    in_fence = False
    for line in text.split( '\n' ):
        toggles_fence = line.count( '```' ) % 2 == 1
        if in_fence and not toggles_fence:
            cleaned.append( line )
            continue
        if toggles_fence:
            in_fence = not in_fence
        # Splitting on single spaces and dropping the empty pieces collapses
        # runs of spaces while leaving other characters untouched.
        line = ' '.join( filter( None, line.strip( ' \t' ).split( ' ' ) ) )
        if not line and cleaned and cleaned[ -1 ] == '':
            continue
        cleaned.append( line )
    return '\n'.join( cleaned ).strip( )

252

253

def _detect_code_language( element: __.typx.Any ) -> str:
    ''' Returns the code language reported by the shared extractor. '''
    language = _extract_code_language( element )
    return language

257

258

def _should_skip_element( element: __.typx.Any ) -> bool:
    ''' Reports whether an element is excluded from conversion.

        An element is excluded when any of its classes is in the skip set
        or when its ``role`` attribute is ``navigation``, ``banner`` or
        ``contentinfo``.
    '''
    excluded_classes = frozenset( (
        'md-nav', 'md-header', 'md-footer', 'md-sidebar',
        'headerlink', 'md-clipboard', 'md-top',
        'toc', 'navigation', 'skip-link',
    ) )
    excluded_roles = ( 'navigation', 'banner', 'contentinfo' )
    class_names = element.get( 'class', [ ] )
    if isinstance( class_names, str ):
        class_names = class_names.split( )
    for class_name in class_names:
        if class_name in excluded_classes:
            return True
    return element.get( 'role' ) in excluded_roles