Coverage for sources/librovore/structures/mkdocs/conversion.py: 0%
168 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-10-20 18:40 +0000
« prev ^ index » next coverage.py v7.11.0, created at 2025-10-20 18:40 +0000
1# vim: set filetype=python fileencoding=utf-8:
2# -*- coding: utf-8 -*-
4#============================================================================#
5# #
6# Licensed under the Apache License, Version 2.0 (the "License"); #
7# you may not use this file except in compliance with the License. #
8# You may obtain a copy of the License at #
9# #
10# http://www.apache.org/licenses/LICENSE-2.0 #
11# #
12# Unless required by applicable law or agreed to in writing, software #
13# distributed under the License is distributed on an "AS IS" BASIS, #
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
15# See the License for the specific language governing permissions and #
16# limitations under the License. #
17# #
18#============================================================================#
21''' HTML to markdown conversion for MkDocs content. '''
from bs4 import BeautifulSoup as _BeautifulSoup
from bs4.element import PreformattedString as _PreformattedString

from . import __
from .converters import extract_code_language as _extract_code_language
def html_to_markdown( html_text: str ) -> str:
    ''' Renders MkDocs-generated HTML as tidy markdown text.

        Blank or whitespace-only input yields an empty string. If the
        parser raises, the input is handed back unchanged.
    '''
    if html_text.strip( ) == '':
        return ''
    try:
        document = _BeautifulSoup( html_text, 'lxml' )
    except Exception:
        # Best effort: fall back to the raw input rather than failing.
        return html_text
    markdown = _convert_element_to_markdown( document, _MarkdownContext( ) )
    return _clean_whitespace( markdown )
class _MarkdownContext:
    ''' Mutable traversal state shared across one conversion run. '''

    def __init__( self ) -> None:
        # A fresh context starts outside of any admonition.
        self.admonition_type: str = ''
        self.in_admonition: bool = False
def _convert_admonition(
    element: __.typx.Any, context: _MarkdownContext
) -> str:
    ''' Converts Material for MkDocs admonition to clean text.

        The result has the shape ``**Title**: content`` followed by a blank
        line, where the content is the converted non-title children joined
        by single spaces. An admonition without content yields an empty
        string.

        The title comes from the element carrying the ``admonition-title``
        class. Its whitespace is normalized, so words separated by inline
        markup are not glued together. When no usable title exists, the
        capitalized admonition type is used, defaulting to ``Note``.

        While children are converted, ``context`` reports that an
        admonition is open. The previous context state is restored
        afterwards, even if a child conversion raises.
    '''
    classes = element.get( 'class', [ ] )
    if isinstance( classes, str ):
        classes = classes.split( )
    admonition_type = next(
        (   cls.capitalize( ) for cls in classes
            if cls in ( 'note', 'info', 'warning', 'danger', 'tip' ) ),
        'Note' )
    title_elem = element.find( class_ = 'admonition-title' )
    title = ''
    if title_elem:
        # Plain get_text( ) keeps the spacing between inline nodes;
        # split/join then collapses runs of whitespace and trims the ends.
        title = ' '.join( title_elem.get_text( ).split( ) )
    if not title:
        title = admonition_type
    previous_state = ( context.in_admonition, context.admonition_type )
    context.in_admonition = True
    context.admonition_type = admonition_type
    content_parts: list[ str ] = [ ]
    try:
        for child in element.children:
            if ( hasattr( child, 'get' )
                 and 'admonition-title' in child.get( 'class', [ ] ) ):
                continue
            converted = _convert_element_to_markdown( child, context ).strip( )
            if converted:
                content_parts.append( converted )
    finally:
        context.in_admonition, context.admonition_type = previous_state
    content = ' '.join( content_parts )
    return f"**{title}**: {content}\n\n" if content else ''
def _convert_children(
    element: __.typx.Any, context: _MarkdownContext
) -> str:
    ''' Concatenates the markdown of every direct child, in order. '''
    return ''.join(
        _convert_element_to_markdown( node, context )
        for node in element.children )
def _convert_code_block(
    element: __.typx.Any, context: _MarkdownContext
) -> str:
    ''' Renders a code container as a fenced markdown block.

        Text is taken from the first nested ``code`` element, else the
        first nested ``pre`` element, else the container itself. The fence
        carries the detected language when there is one. A container
        without text yields an empty string.
    '''
    language = _detect_code_language( element )
    source = element.find( 'code' ) or element.find( 'pre' ) or element
    code_text = source.get_text( ).strip( )
    if not code_text:
        return ''
    opening = f"```{language}" if language else '```'
    return f"{opening}\n{code_text}\n```\n\n"
def _convert_definition_list(
    element: __.typx.Any, context: _MarkdownContext
) -> str:
    ''' Renders a definition list from its converted children.

        A trailing newline is appended. A list whose children convert to
        nothing but whitespace yields an empty string.
    '''
    rendered = _convert_children( element, context )
    if not rendered.strip( ):
        return ''
    return f"{rendered}\n"
def _convert_div( element: __.typx.Any, context: _MarkdownContext ) -> str:
    ''' Routes a div by its CSS classes.

        ``admonition`` divs go to the admonition converter. Divs classed
        ``highlight``, ``codehilite`` or ``superfences`` go to the code
        block converter. Any other div becomes its converted children
        followed by a blank line, or an empty string when the children
        are blank.
    '''
    classes = element.get( 'class', [ ] )
    if isinstance( classes, str ):
        classes = classes.split( )
    if 'admonition' in classes:
        return _convert_admonition( element, context )
    code_markers = ( 'highlight', 'codehilite', 'superfences' )
    if any( marker in classes for marker in code_markers ):
        return _convert_code_block( element, context )
    body = _convert_children( element, context )
    if not body.strip( ):
        return ''
    return f"{body}\n\n"
def _convert_element_to_markdown(
    element: __.typx.Any, context: _MarkdownContext
) -> str:
    ''' Converts HTML element to markdown using single-pass traversal.

        Nodes with a non-empty ``name`` are tags and go to the tag
        converter. Markup-only string nodes (comments, doctypes, processing
        instructions and other ``PreformattedString`` subclasses) produce
        an empty string, so their source text never appears in the
        output. Every other node is returned as its string value.
    '''
    if hasattr( element, 'name' ) and element.name:
        return _convert_tag_to_markdown( element, context )
    # Comment, Doctype and similar nodes are str subclasses, so str( )
    # would otherwise emit their raw contents as visible text.
    if isinstance( element, _PreformattedString ):
        return ''
    return str( element )
def _convert_header( element: __.typx.Any ) -> str:
    ''' Converts header element to markdown.

        The heading level is read from the digit in the tag name
        (``h1`` to ``h6``). Text located inside an element classed
        ``headerlink`` (the permalink anchor) is left out. Whitespace is
        normalized so that words separated by inline markup stay apart.
        A heading without remaining text yields an empty string.
    '''
    fragments: list[ str ] = [ ]
    for string in element.strings:
        # Walk upwards until the heading itself is reached; a headerlink
        # ancestor on the way means the string belongs to the permalink.
        for ancestor in string.parents:
            if ancestor is element:
                fragments.append( str( string ) )
                break
            classes = ancestor.get( 'class' ) or [ ]
            if isinstance( classes, str ):
                classes = classes.split( )
            if 'headerlink' in classes:
                break
    # Joining before normalizing preserves the original spacing between
    # inline nodes; split/join collapses whitespace runs and trims ends.
    text = ' '.join( ''.join( fragments ).split( ) )
    if not text:
        return ''
    level = int( element.name[ 1 ] )
    prefix = '#' * level
    return f"{prefix} {text}\n\n"
def _convert_inline_code( element: __.typx.Any ) -> str:
    ''' Converts inline code element.

        Text without backticks is wrapped in single backticks. Text that
        contains backticks is wrapped in a delimiter one backtick longer
        than its longest backtick run and padded with one space per side,
        so the delimiters cannot be confused with the content.
    '''
    text = element.get_text( )
    if '`' not in text:
        return f"`{text}`"
    longest = current = 0
    for character in text:
        current = current + 1 if character == '`' else 0
        longest = max( longest, current )
    fence = '`' * ( longest + 1 )
    # CommonMark drops one leading and one trailing space from a code span
    # when both are present, so the padding does not alter the content.
    return f"{fence} {text} {fence}"
def _convert_link( element: __.typx.Any, context: _MarkdownContext ) -> str:
    ''' Renders an anchor as a markdown link or as plain text.

        Anchors without an ``href``, or whose ``href`` starts with ``#``,
        are reduced to their text. All others become ``[text](href)``.
    '''
    target = element.get( 'href', '' )
    label = element.get_text( )
    if not target or target.startswith( '#' ):
        return label
    return f"[{label}]({target})"
def _convert_preformatted(
    element: __.typx.Any, context: _MarkdownContext
) -> str:
    ''' Converts preformatted text block.

        The element text is emitted as a fenced block, tagged with the
        detected language when there is one. Leading line breaks and
        trailing whitespace are trimmed so the fence holds no stray blank
        lines, while indentation of the first code line is kept. A block
        without text yields an empty string.
    '''
    language = _detect_code_language( element )
    # Only line breaks are removed at the front: stripping spaces there
    # would shift the first line relative to the rest of the code.
    text = element.get_text( ).lstrip( '\r\n' ).rstrip( )
    if not text:
        return ''
    if language:
        return f"```{language}\n{text}\n```\n\n"
    return f"```\n{text}\n```\n\n"
def _convert_span( element: __.typx.Any, context: _MarkdownContext ) -> str:
    ''' Renders a span, emboldening those classed ``doc-heading``.

        Ordinary spans contribute their converted children unchanged.
        A ``doc-heading`` span wraps them in ``**``, or yields an empty
        string when the children are blank.
    '''
    classes = element.get( 'class', [ ] )
    if isinstance( classes, str ):
        classes = classes.split( )
    body = _convert_children( element, context )
    if 'doc-heading' not in classes:
        return body
    return f"**{body}**" if body.strip( ) else ''
def _convert_table( element: __.typx.Any, context: _MarkdownContext ) -> str:
    ''' Converts HTML table to simple text representation.

        Each ``tr`` found below the element becomes one line whose ``td``
        and ``th`` cell texts are joined with `` | ``. Whitespace inside a
        cell, including line breaks, is collapsed to single spaces, so a
        row always stays on one line and words separated by inline markup
        are not glued together. Rows without cells are dropped. The result
        ends with a blank line, or is an empty string when no row remains.
    '''
    rows: list[ str ] = [ ]
    for row in element.find_all( 'tr' ):
        cells = [
            ' '.join( cell.get_text( ).split( ) )
            for cell in row.find_all( [ 'td', 'th' ] ) ]
        if cells:
            rows.append( ' | '.join( cells ) )
    return '\n'.join( rows ) + '\n\n' if rows else ''
201def _convert_tag_to_markdown( # noqa: C901, PLR0911, PLR0912
202 element: __.typx.Any, context: _MarkdownContext
203) -> str:
204 ''' Converts HTML tag to markdown with MkDocs-specific handling. '''
205 if _should_skip_element( element ): return ''
206 match element.name:
207 case 'code': return _convert_inline_code( element )
208 case 'pre': return _convert_preformatted( element, context )
209 case 'strong' | 'b':
210 children = _convert_children( element, context )
211 return f"**{children}**" if children.strip( ) else ''
212 case 'em' | 'i':
213 children = _convert_children( element, context )
214 return f"*{children}*" if children.strip( ) else ''
215 case 'a': return _convert_link( element, context )
216 case 'span': return _convert_span( element, context )
217 case 'div': return _convert_div( element, context )
218 case 'p' | 'section' | 'article':
219 children = _convert_children( element, context )
220 return f"{children}\n\n" if children.strip( ) else ''
221 case 'li':
222 children = _convert_children( element, context )
223 return f"- {children}\n" if children.strip( ) else ''
224 case 'ul' | 'ol':
225 children = _convert_children( element, context )
226 return f"{children}\n" if children.strip( ) else ''
227 case 'dl':
228 return _convert_definition_list( element, context )
229 case 'dt':
230 children = _convert_children( element, context )
231 return f"**{children}**" if children.strip( ) else ''
232 case 'dd':
233 children = _convert_children( element, context )
234 return f": {children}\n" if children.strip( ) else ''
235 case 'h1' | 'h2' | 'h3' | 'h4' | 'h5' | 'h6':
236 return _convert_header( element )
237 case 'br': return '\n'
238 case 'table' | 'tr' | 'td' | 'th' | 'thead' | 'tbody':
239 return _convert_table( element, context )
240 case _:
241 return _convert_children( element, context )
def _clean_whitespace( text: str ) -> str:
    ''' Cleans up whitespace while preserving markdown structure.

        Outside fenced code blocks, runs of spaces collapse to one space,
        spaces and tabs at line edges are removed, three or more
        consecutive newlines shrink to two, and the result is stripped.
        Fenced code blocks are carried through verbatim, so indentation
        and blank lines inside code survive the cleanup.
    '''
    fenced: list[ str ] = [ ]

    def stash( match: __.typx.Any ) -> str:
        # NUL-delimited index: contains no whitespace, so none of the
        # cleanup patterns below can touch or split the placeholder.
        fenced.append( match.group( 0 ) )
        return f"\x00{len( fenced ) - 1}\x00"

    def restore( match: __.typx.Any ) -> str:
        index = int( match.group( 1 ) )
        # Leave look-alike sequences from the input untouched.
        return fenced[ index ] if index < len( fenced ) else match.group( 0 )

    # A fence opens with ``` plus optional info string and closes with
    # ``` at the start of a line.
    text = __.re.sub(
        r'```[^\n`]*\n.*?\n```', stash, text, flags = __.re.DOTALL )
    text = __.re.sub( r' +', ' ', text )
    text = __.re.sub( r'\n +', '\n', text )
    text = __.re.sub( r' +\n', '\n', text )
    text = __.re.sub( r'\n{3,}', '\n\n', text )
    text = __.re.sub( r'^[ \t]+|[ \t]+$', '', text, flags = __.re.MULTILINE )
    text = text.strip( )
    return __.re.sub( r'\x00(\d+)\x00', restore, text )
def _detect_code_language( element: __.typx.Any ) -> str:
    ''' Reports the code language for an element.

        Delegates entirely to the shared language extractor.
    '''
    language = _extract_code_language( element )
    return language
def _should_skip_element( element: __.typx.Any ) -> bool:
    ''' Determines if element should be skipped entirely.

        An element is skipped when any of the following holds:

        * it is a ``script`` or ``style`` tag, whose contents are source
          code rather than document text;
        * one of its CSS classes is in the skip list below;
        * its ``role`` attribute is ``navigation``, ``banner`` or
          ``contentinfo``.
    '''
    if getattr( element, 'name', None ) in ( 'script', 'style' ):
        return True
    classes = element.get( 'class', [ ] )
    if isinstance( classes, str ):
        classes = classes.split( )
    skip_classes = {
        'md-nav', 'md-header', 'md-footer', 'md-sidebar',
        'headerlink', 'md-clipboard', 'md-top',
        'toc', 'navigation', 'skip-link'
    }
    return (
        any( cls in skip_classes for cls in classes )
        or element.get( 'role' ) in ( 'navigation', 'banner', 'contentinfo' )
    )