Coverage for sources/librovore/structures/mkdocs/conversion.py: 0%
191 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-03 21:59 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-03 21:59 +0000
1# vim: set filetype=python fileencoding=utf-8:
2# -*- coding: utf-8 -*-
4#============================================================================#
5# #
6# Licensed under the Apache License, Version 2.0 (the "License"); #
7# you may not use this file except in compliance with the License. #
8# You may obtain a copy of the License at #
9# #
10# http://www.apache.org/licenses/LICENSE-2.0 #
11# #
12# Unless required by applicable law or agreed to in writing, software #
13# distributed under the License is distributed on an "AS IS" BASIS, #
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
15# See the License for the specific language governing permissions and #
16# limitations under the License. #
17# #
18#============================================================================#
21''' HTML to markdown conversion for MkDocs content. '''
from bs4 import BeautifulSoup as _BeautifulSoup
from bs4.element import PreformattedString as _PreformattedString

from . import __
def html_to_markdown( html_text: str ) -> str:
    ''' Renders MkDocs-generated HTML as tidy markdown text.

        Blank or whitespace-only input yields an empty string. If the
        parser raises while building the tree, the input is handed back
        unchanged as a best-effort fallback.
    '''
    if html_text.strip( ) == '': return ''
    try: document = _BeautifulSoup( html_text, 'lxml' )
    except Exception: return html_text
    markdown = _convert_element_to_markdown( document, _MarkdownContext( ) )
    return _clean_whitespace( markdown )
39class _MarkdownContext:
40 ''' Context for tracking state during HTML-to-Markdown conversion. '''
42 def __init__( self ) -> None:
43 self.in_admonition = False
44 self.admonition_type = ''
def _convert_admonition(
    element: __.typx.Any, context: _MarkdownContext
) -> str:
    ''' Flattens a Material for MkDocs admonition into bold-titled text.

        The label is the first recognized admonition class, capitalized,
        defaulting to ``Note``. The title comes from the element marked
        ``admonition-title`` when present, otherwise from the label. The
        context's admonition state is set while the children are
        converted and put back afterwards. Returns an empty string when
        the admonition has no body text.
    '''
    classes = element.get( 'class', [ ] )
    if isinstance( classes, str ): classes = classes.split( )
    recognized = ( 'note', 'info', 'warning', 'danger', 'tip' )
    label = next(
        ( name.capitalize( ) for name in classes if name in recognized ),
        'Note' )
    # Remember the enclosing state so nested admonitions unwind correctly.
    saved_state = ( context.in_admonition, context.admonition_type )
    context.in_admonition, context.admonition_type = True, label
    heading = element.find( class_ = 'admonition-title' )
    title = heading.get_text( strip = True ) if heading else label
    pieces: list[ str ] = [ ]
    for child in element.children:
        is_title = (
            hasattr( child, 'get' )
            and 'admonition-title' in child.get( 'class', [ ] ) )
        # The title is already captured above; do not repeat it in the body.
        if is_title: continue
        text = _convert_element_to_markdown( child, context ).strip( )
        if text: pieces.append( text )
    context.in_admonition, context.admonition_type = saved_state
    body = ' '.join( pieces )
    if not body: return ''
    return f"**{title}**: {body}\n\n"
def _convert_children(
    element: __.typx.Any, context: _MarkdownContext
) -> str:
    ''' Converts every child node and concatenates results in order. '''
    return ''.join(
        _convert_element_to_markdown( child, context )
        for child in element.children )
def _convert_code_block(
    element: __.typx.Any, context: _MarkdownContext
) -> str:
    ''' Converts code block container to fenced markdown code block.

        The code is read from the first nested ``code`` element, else
        the first nested ``pre`` element, else the container itself. The
        fence carries the language reported by ``_detect_code_language``
        when there is one. Returns an empty string for blank code.

        Only surrounding newlines and trailing whitespace are trimmed.
        Leading spaces on the first line are kept so that an indented
        snippet stays aligned with the lines which follow it.
    '''
    language = _detect_code_language( element )
    code_element = element.find( 'code' ) or element.find( 'pre' )
    code_text = (
        code_element.get_text( ) if code_element else element.get_text( ) )
    if not code_text.strip( ): return ''
    # A plain ``strip( )`` here would eat the first line's indentation.
    code_text = code_text.strip( '\n' ).rstrip( )
    if language: return f"```{language}\n{code_text}\n```\n\n"
    return f"```\n{code_text}\n```\n\n"
def _convert_definition_list(
    element: __.typx.Any, context: _MarkdownContext
) -> str:
    ''' Renders a definition list from its converted children.

        A trailing newline is appended; a list whose children render
        blank produces an empty string.
    '''
    rendered = _convert_children( element, context )
    if not rendered.strip( ): return ''
    return f"{rendered}\n"
def _convert_div( element: __.typx.Any, context: _MarkdownContext ) -> str:
    ''' Renders a ``div``, recognizing MkDocs-specific containers.

        Admonition containers and code containers (``highlight``,
        ``codehilite``, ``superfences``) go to their dedicated
        converters. Any other ``div`` renders as its converted children
        followed by a blank line, or as nothing when they are blank.
    '''
    classes = element.get( 'class', [ ] )
    if isinstance( classes, str ): classes = classes.split( )
    if 'admonition' in classes:
        return _convert_admonition( element, context )
    code_markers = ( 'highlight', 'codehilite', 'superfences' )
    if any( marker in classes for marker in code_markers ):
        return _convert_code_block( element, context )
    inner = _convert_children( element, context )
    if not inner.strip( ): return ''
    return f"{inner}\n\n"
def _convert_element_to_markdown(
    element: __.typx.Any, context: _MarkdownContext
) -> str:
    ''' Converts HTML element to markdown using single-pass traversal.

        Nodes with a tag name are dispatched to
        ``_convert_tag_to_markdown``. Plain text nodes are returned
        verbatim. Special string nodes, such as comments, doctypes,
        declarations, processing instructions and CDATA, are markup
        rather than content and so produce no output.
    '''
    if getattr( element, 'name', None ):
        return _convert_tag_to_markdown( element, context )
    # These nodes have no tag name and stringify to their inner text, so
    # without this guard comment and doctype text would leak into output.
    if isinstance( element, _PreformattedString ): return ''
    return str( element )
def _convert_header( element: __.typx.Any ) -> str:
    ''' Renders an ``h1``-``h6`` element as an ATX-style heading.

        The heading level is taken from the digit in the tag name.
        Headings without text produce an empty string.
    '''
    title = element.get_text( strip = True )
    if title:
        hashes = '#' * int( element.name[ 1 ] )
        return f"{hashes} {title}\n\n"
    return ''
def _convert_inline_code( element: __.typx.Any ) -> str:
    ''' Wraps the element's text in single backticks. '''
    content = element.get_text( )
    return '`' + content + '`'
def _convert_link( element: __.typx.Any, context: _MarkdownContext ) -> str:
    ''' Renders an anchor as a markdown link, or as bare text.

        Anchors without an ``href``, and anchors whose ``href`` starts
        with ``#``, are reduced to their text.
    '''
    label = element.get_text( )
    target = element.get( 'href', '' )
    if not target or target.startswith( '#' ): return label
    return f"[{label}]({target})"
def _convert_preformatted(
    element: __.typx.Any, context: _MarkdownContext
) -> str:
    ''' Converts preformatted text block to fenced markdown code block.

        The fence carries the language reported by
        ``_detect_code_language`` when there is one. Returns an empty
        string for blank text.

        Surrounding newlines and trailing whitespace are trimmed so
        that the closing fence directly follows the last line of code.
        Leading spaces on the first line are kept to preserve alignment.
    '''
    language = _detect_code_language( element )
    text = element.get_text( )
    if not text.strip( ): return ''
    # Untrimmed text ending in a newline would leave a stray blank line
    # between the code and the closing fence.
    text = text.strip( '\n' ).rstrip( )
    if language: return f"```{language}\n{text}\n```\n\n"
    return f"```\n{text}\n```\n\n"
def _convert_span( element: __.typx.Any, context: _MarkdownContext ) -> str:
    ''' Renders a ``span``, emboldening mkdocstrings headings.

        Spans classed ``doc-heading`` render their converted children in
        bold, or as nothing when blank. Every other span renders as its
        converted children unchanged.
    '''
    classes = element.get( 'class', [ ] )
    if isinstance( classes, str ): classes = classes.split( )
    inner = _convert_children( element, context )
    if 'doc-heading' not in classes: return inner
    if not inner.strip( ): return ''
    return f"**{inner}**"
def _convert_table( element: __.typx.Any, context: _MarkdownContext ) -> str:
    ''' Renders table rows as lines of pipe-separated cell text.

        Each ``tr`` with at least one ``td`` or ``th`` becomes one line.
        Returns an empty string when no such row is found.
    '''
    lines: list[ str ] = [ ]
    for row in element.find_all( 'tr' ):
        cells = [
            cell.get_text( strip = True )
            for cell in row.find_all( [ 'td', 'th' ] ) ]
        if cells: lines.append( ' | '.join( cells ) )
    if not lines: return ''
    return '\n'.join( lines ) + '\n\n'
def _convert_tag_to_markdown( # noqa: C901, PLR0911, PLR0912
    element: __.typx.Any, context: _MarkdownContext
) -> str:
    ''' Renders one HTML tag as markdown, honoring MkDocs conventions.

        Elements flagged by ``_should_skip_element`` produce no output.
        Tags with a dedicated converter are delegated to it. Simple
        wrapper tags decorate their converted children. Any other tag
        passes its converted children through unchanged.
    '''
    if _should_skip_element( element ): return ''
    tag = element.name
    # Tags handled by a dedicated converter.
    if tag == 'code': return _convert_inline_code( element )
    if tag == 'pre': return _convert_preformatted( element, context )
    if tag == 'a': return _convert_link( element, context )
    if tag == 'span': return _convert_span( element, context )
    if tag == 'div': return _convert_div( element, context )
    if tag == 'dl': return _convert_definition_list( element, context )
    if tag in ( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ):
        return _convert_header( element )
    if tag == 'br': return '\n'
    if tag in ( 'table', 'tr', 'td', 'th', 'thead', 'tbody' ):
        return _convert_table( element, context )
    # ( prefix, suffix ) placed around the converted children of simple
    # wrapper tags; such a tag renders as nothing when its children are
    # blank.
    decorations = {
        'strong': ( '**', '**' ),
        'b': ( '**', '**' ),
        'em': ( '*', '*' ),
        'i': ( '*', '*' ),
        'p': ( '', '\n\n' ),
        'section': ( '', '\n\n' ),
        'article': ( '', '\n\n' ),
        'li': ( '- ', '\n' ),
        'ul': ( '', '\n' ),
        'ol': ( '', '\n' ),
        'dt': ( '**', '**' ),
        'dd': ( ': ', '\n' ),
    }
    inner = _convert_children( element, context )
    if tag not in decorations: return inner
    if not inner.strip( ): return ''
    prefix, suffix = decorations[ tag ]
    return f"{prefix}{inner}{suffix}"
243def _clean_whitespace( text: str ) -> str:
244 ''' Cleans up whitespace while preserving markdown structure. '''
245 text = __.re.sub( r' +', ' ', text )
246 text = __.re.sub( r'\n +', '\n', text )
247 text = __.re.sub( r' +\n', '\n', text )
248 text = __.re.sub( r'\n{3,}', '\n\n', text )
249 text = __.re.sub( r'^[ \t]+|[ \t]+$', '', text, flags = __.re.MULTILINE )
250 return text.strip( )
def _detect_code_language( element: __.typx.Any ) -> str: # noqa: C901, PLR0911
    ''' Infers a code block's language from CSS class names.

        The element's own classes are examined first: a class with a
        ``language-``, ``highlight-`` or ``lang-`` prefix yields the
        remainder of the class, and a class which is itself one of the
        recognized language names yields that name. Failing that, the
        classes of the first nested ``code`` element are examined for
        the prefixes only. Returns an empty string when nothing matches.
    '''
    prefixes = ( 'language-', 'highlight-', 'lang-' )
    bare_names = (
        'python', 'javascript', 'typescript', 'bash', 'shell',
        'json', 'yaml', 'xml', 'html', 'css', 'sql', 'rust',
        'go', 'java', 'cpp', 'c' )

    def class_names( node: __.typx.Any ) -> __.typx.Any:
        names = node.get( 'class', [ ] )
        return names.split( ) if isinstance( names, str ) else names

    def from_prefix( name: str ) -> __.typx.Any:
        # ``None`` means "no prefix matched"; an empty remainder is still
        # a match and ends the search.
        for prefix in prefixes:
            if name.startswith( prefix ): return name[ len( prefix ): ]
        return None

    for name in class_names( element ):
        language = from_prefix( name )
        if language is not None: return language
        if name in bare_names: return name
    nested = element.find( 'code' )
    if nested:
        for name in class_names( nested ):
            language = from_prefix( name )
            if language is not None: return language
    return ''
def _should_skip_element( element: __.typx.Any ) -> bool:
    ''' Determines if element should be skipped entirely.

        An element is skipped when any of the following holds:

        * it is a ``script`` or ``style`` element, whose text is source
          code rather than document content;
        * it carries one of the navigation or page-chrome classes;
        * its ``role`` attribute is ``navigation``, ``banner`` or
          ``contentinfo``.
    '''
    # Without this check these tags fall through to generic child
    # conversion, which emits their JavaScript or CSS source as text.
    if element.name in ( 'script', 'style' ): return True
    classes = element.get( 'class', [ ] )
    if isinstance( classes, str ):
        classes = classes.split( )
    skip_classes = {
        'md-nav', 'md-header', 'md-footer', 'md-sidebar',
        'headerlink', 'md-clipboard', 'md-top',
        'toc', 'navigation', 'skip-link'
    }
    return (
        any( cls in skip_classes for cls in classes )
        or element.get( 'role' ) in ( 'navigation', 'banner', 'contentinfo' )
    )