Coverage for sources/librovore/structures/sphinx/conversion.py: 0%

32 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-09-28 22:09 +0000

1# vim: set filetype=python fileencoding=utf-8: 

2# -*- coding: utf-8 -*- 

3 

4#============================================================================# 

5# # 

6# Licensed under the Apache License, Version 2.0 (the "License"); # 

7# you may not use this file except in compliance with the License. # 

8# You may obtain a copy of the License at # 

9# # 

10# http://www.apache.org/licenses/LICENSE-2.0 # 

11# # 

12# Unless required by applicable law or agreed to in writing, software # 

13# distributed under the License is distributed on an "AS IS" BASIS, # 

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 

15# See the License for the specific language governing permissions and # 

16# limitations under the License. # 

17# # 

18#============================================================================# 

19 

20 

21''' HTML to markdown conversion utilities. ''' 

22 

23 

24from . import __ 

25 

26from .converters import convert_code_block_to_markdown as _convert_code_block 

27 

28 

class SphinxMarkdownConverter( __.markdownify.MarkdownConverter ):
    ''' Markdownify converter specialized for Sphinx-generated HTML.

        Overrides handling of ``pre`` elements so that recognized code
        blocks are rendered by the dedicated code block converter.
    '''

    def convert_pre(
        self,
        el: __.typx.Annotated[
            __.typx.Any,
            __.ddoc.Doc( '''HTML pre element to convert.''' ),
        ],
        text: __.typx.Annotated[
            str,
            __.ddoc.Doc( '''Text content of the element.''' ),
        ],
        convert_as_inline: __.typx.Annotated[
            bool,
            __.ddoc.Doc( '''Whether to convert as inline element.''' ),
        ],
    ) -> __.typx.Annotated[
        str,
        __.ddoc.Doc( '''Converted markdown text.''' ),
    ]:
        ''' Renders pre element, special-casing detected code blocks.

            Elements not recognized as code blocks are handled by the base
            class implementation.
        '''
        if not self.is_code_block( el ):
            return super( ).convert_pre( el, text, convert_as_inline )
        return _convert_code_block( el )

    def is_code_block(
        self,
        element: __.typx.Annotated[
            __.typx.Any,
            __.ddoc.Doc( '''HTML element to check for code block.''' ),
        ],
    ) -> __.typx.Annotated[
        bool,
        __.ddoc.Doc( '''True if element represents a code block.''' ),
    ]:
        ''' Reports whether element looks like a highlighted code block.

            An element qualifies if its own classes include ``highlight``
            or if any class on its direct parent begins with ``highlight-``.
        '''
        # NOTE(review): Only the element and its immediate parent are
        #               inspected; confirm this covers the markup nesting
        #               produced by the targeted Sphinx themes.
        if 'highlight' in element.get( 'class', [ ] ): return True
        container = element.parent
        if not container: return False
        return any(
            name.startswith( 'highlight-' )
            for name in container.get( 'class', [ ] ) )

74 

75 

def html_to_markdown(
    html_text: __.typx.Annotated[
        str,
        __.ddoc.Doc( '''HTML text to convert to markdown.''' ),
    ],
) -> __.typx.Annotated[
    str,
    __.ddoc.Doc( '''Converted markdown with Sphinx-specific processing.''' ),
]:
    ''' Renders HTML text as markdown on a best-effort basis.

        Blank input produces an empty string. If either preprocessing or
        conversion raises, the original HTML text is returned unchanged.
    '''
    if not html_text.strip( ): return ''
    options = dict(
        heading_style = 'ATX',
        strip = [ 'nav', 'header', 'footer' ],
        escape_underscores = False,
        escape_asterisks = False,
    )
    # Both stages share the same fallback: hand back the raw HTML.
    try:
        prepared = _preprocess_sphinx_html( html_text )
        rendered = SphinxMarkdownConverter( **options ).convert( prepared )
    except Exception: return html_text
    return rendered.strip( )

99 

100 

def html_to_markdown_sphinx(
    html_text: __.typx.Annotated[
        str,
        __.ddoc.Doc( '''HTML text to convert using Sphinx patterns.''' ),
    ],
) -> __.typx.Annotated[
    str,
    __.ddoc.Doc( '''Converted markdown text.''' ),
]:
    ''' Forwards to ``html_to_markdown`` and returns its result. '''
    converted = html_to_markdown( html_text )
    return converted

112 

113 

def _preprocess_sphinx_html(
    html_text: __.typx.Annotated[
        str,
        __.ddoc.Doc( '''Raw HTML text to preprocess.''' ),
    ],
) -> __.typx.Annotated[
    str,
    __.ddoc.Doc( '''Cleaned HTML text ready for markdown conversion.''' ),
]:
    ''' Strips ``headerlink`` elements and reserializes the document.

        The HTML is parsed with the ``lxml`` parser, every element of class
        ``headerlink`` is removed, and the resulting tree is returned as a
        string.
    '''
    document = __.bs4.BeautifulSoup( html_text, 'lxml' )
    # Drop headerlink elements (the ¶ symbols next to headings).
    headerlinks = document.find_all( class_ = 'headerlink' )
    for anchor in headerlinks: anchor.decompose( )
    return str( document )

128 return str( soup )