Coverage for sources/librovore/structures/sphinx/conversion.py: 0%
32 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-10-20 18:40 +0000
« prev ^ index » next coverage.py v7.11.0, created at 2025-10-20 18:40 +0000
1# vim: set filetype=python fileencoding=utf-8:
2# -*- coding: utf-8 -*-
4#============================================================================#
5# #
6# Licensed under the Apache License, Version 2.0 (the "License"); #
7# you may not use this file except in compliance with the License. #
8# You may obtain a copy of the License at #
9# #
10# http://www.apache.org/licenses/LICENSE-2.0 #
11# #
12# Unless required by applicable law or agreed to in writing, software #
13# distributed under the License is distributed on an "AS IS" BASIS, #
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
15# See the License for the specific language governing permissions and #
16# limitations under the License. #
17# #
18#============================================================================#
21''' HTML to markdown conversion utilities. '''
24from . import __
26from .converters import convert_code_block_to_markdown as _convert_code_block
class SphinxMarkdownConverter( __.markdownify.MarkdownConverter ):
    ''' Markdownify converter which recognizes Sphinx code blocks.

        Overrides ``pre`` handling so that elements accepted by
        :meth:`is_code_block` are rendered by the imported code block
        converter, while every other ``pre`` element falls through to the
        inherited conversion.
    '''

    def convert_pre(
        self,
        el: __.typx.Annotated[
            __.typx.Any,
            __.ddoc.Doc( '''HTML pre element to convert.''' ),
        ],
        text: __.typx.Annotated[
            str,
            __.ddoc.Doc( '''Text content of the element.''' ),
        ],
        convert_as_inline: __.typx.Annotated[
            bool,
            __.ddoc.Doc( '''Whether to convert as inline element.''' ),
        ],
    ) -> __.typx.Annotated[
        str,
        __.ddoc.Doc( '''Converted markdown text.''' ),
    ]:
        ''' Renders pre element, preferring code block handling. '''
        # Anything not recognized as a code block keeps stock behavior.
        if not self.is_code_block( el ):
            return super( ).convert_pre( el, text, convert_as_inline )
        return _convert_code_block( el )

    def is_code_block(
        self,
        element: __.typx.Annotated[
            __.typx.Any,
            __.ddoc.Doc( '''HTML element to check for code block.''' ),
        ],
    ) -> __.typx.Annotated[
        bool,
        __.ddoc.Doc( '''True if element represents a code block.''' ),
    ]:
        ''' Detects code blocks via ``highlight`` class conventions.

            An element qualifies when ``'highlight'`` is found in its own
            ``class`` attribute value or when any class on its parent
            begins with ``highlight-``.
        '''
        if 'highlight' in element.get( 'class', [ ] ): return True
        container = element.parent
        # No usable parent means there is nothing further to inspect.
        if not container: return False
        return any(
            name.startswith( 'highlight-' )
            for name in container.get( 'class', [ ] ) )
def html_to_markdown(
    html_text: __.typx.Annotated[
        str,
        __.ddoc.Doc( '''HTML text to convert to markdown.''' ),
    ],
) -> __.typx.Annotated[
    str,
    __.ddoc.Doc( '''Converted markdown with Sphinx-specific processing.''' ),
]:
    ''' Renders HTML as markdown with Sphinx-aware preprocessing.

        Blank input produces an empty string. If preprocessing or
        conversion raises, the original HTML text is returned unchanged
        as a best-effort fallback.
    '''
    if not html_text.strip( ): return ''
    options = dict(
        heading_style = 'ATX',
        strip = [ 'nav', 'header', 'footer' ],
        escape_underscores = False,
        escape_asterisks = False,
    )
    # Both stages share the same fallback, so they share one guard.
    try:
        prepared = _preprocess_sphinx_html( html_text )
        rendered = SphinxMarkdownConverter( **options ).convert( prepared )
    except Exception: return html_text
    return rendered.strip( )
def html_to_markdown_sphinx(
    html_text: __.typx.Annotated[
        str,
        __.ddoc.Doc( '''HTML text to convert using Sphinx patterns.''' ),
    ],
) -> __.typx.Annotated[
    str,
    __.ddoc.Doc( '''Converted markdown text.''' ),
]:
    ''' Sphinx-named alias which delegates to :func:`html_to_markdown`. '''
    markdown = html_to_markdown( html_text )
    return markdown
def _preprocess_sphinx_html(
    html_text: __.typx.Annotated[
        str,
        __.ddoc.Doc( '''Raw HTML text to preprocess.''' ),
    ],
) -> __.typx.Annotated[
    str,
    __.ddoc.Doc( '''Cleaned HTML text ready for markdown conversion.''' ),
]:
    ''' Strips ``headerlink`` elements from HTML and reserializes it. '''
    document = __.bs4.BeautifulSoup( html_text, 'lxml' )
    headerlinks = document.find_all( class_ = 'headerlink' )
    # Header links carry the pilcrow (¶) anchors; drop them entirely.
    for headerlink in headerlinks: headerlink.decompose( )
    return str( document )