Coverage for sources/librovore/structures/sphinx/conversion.py: 0%

1# vim: set filetype=python fileencoding=utf-8:

2# -*- coding: utf-8 -*-

4#============================================================================#

5# #

6# Licensed under the Apache License, Version 2.0 (the "License"); #

7# you may not use this file except in compliance with the License. #

8# You may obtain a copy of the License at #

9# #

10# http://www.apache.org/licenses/LICENSE-2.0 #

11# #

12# Unless required by applicable law or agreed to in writing, software #

13# distributed under the License is distributed on an "AS IS" BASIS, #

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #

15# See the License for the specific language governing permissions and #

16# limitations under the License. #

17# #

18#============================================================================#

21''' HTML to markdown conversion utilities. '''

24from . import __

26from .converters import convert_code_block_to_markdown as _convert_code_block

class SphinxMarkdownConverter( __.markdownify.MarkdownConverter ):
    ''' Markdownify converter specialized for Sphinx-generated HTML.

        Overrides ``pre`` handling so that elements recognized as code
        blocks are delegated to ``_convert_code_block``; every other
        ``pre`` element falls through to the base converter.
    '''

    def convert_pre(
        self,
        el: __.typx.Annotated[
            __.typx.Any,
            __.ddoc.Doc( '''HTML pre element to convert.''' ),
        ],
        text: __.typx.Annotated[
            str,
            __.ddoc.Doc( '''Text content of the element.''' ),
        ],
        convert_as_inline: __.typx.Annotated[
            bool,
            __.ddoc.Doc( '''Whether to convert as inline element.''' ),
        ],
    ) -> __.typx.Annotated[
        str,
        __.ddoc.Doc( '''Converted markdown text.''' ),
    ]:
        ''' Renders pre element, preferring the code block converter. '''
        # Anything not recognized as a code block keeps stock behavior.
        if not self.is_code_block( el ):
            return super( ).convert_pre( el, text, convert_as_inline )
        return _convert_code_block( el )

    def is_code_block(
        self,
        element: __.typx.Annotated[
            __.typx.Any,
            __.ddoc.Doc( '''HTML element to check for code block.''' ),
        ],
    ) -> __.typx.Annotated[
        bool,
        __.ddoc.Doc( '''True if element represents a code block.''' ),
    ]:
        ''' Reports whether element looks like a highlighted code block.

            An element qualifies when it carries the ``highlight`` class
            itself, or when its direct parent has any class beginning
            with ``highlight-``. Only the element and its immediate
            parent are inspected.
        '''
        if 'highlight' in element.get( 'class', [ ] ): return True
        container = element.parent
        if not container: return False
        return any(
            name.startswith( 'highlight-' )
            for name in container.get( 'class', [ ] ) )

def html_to_markdown(
    html_text: __.typx.Annotated[
        str,
        __.ddoc.Doc( '''HTML text to convert to markdown.''' ),
    ],
) -> __.typx.Annotated[
    str,
    __.ddoc.Doc( '''Converted markdown with Sphinx-specific processing.''' ),
]:
    ''' Converts HTML text to markdown using Sphinx-specific patterns.

        Whitespace-only input yields an empty string. Conversion is
        best-effort: if preprocessing or markdown conversion raises, the
        original HTML text is returned unchanged.
    '''
    if not html_text.strip( ): return ''
    try:
        prepared = _preprocess_sphinx_html( html_text )
        rendered = SphinxMarkdownConverter(
            heading_style = 'ATX',
            strip = [ 'nav', 'header', 'footer' ],
            escape_underscores = False,
            escape_asterisks = False
        ).convert( prepared )
    # Deliberate fallback: hand back the raw HTML rather than fail.
    except Exception: return html_text
    return rendered.strip( )

100

def html_to_markdown_sphinx(
    html_text: __.typx.Annotated[
        str,
        __.ddoc.Doc( '''HTML text to convert using Sphinx patterns.''' ),
    ],
) -> __.typx.Annotated[
    str,
    __.ddoc.Doc( '''Converted markdown text.''' ),
]:
    ''' Converts HTML to markdown by delegating to ``html_to_markdown``.

        Thin alias: the argument is passed through unchanged and the
        result is returned as-is.
    '''
    markdown = html_to_markdown( html_text )
    return markdown

112

113

def _preprocess_sphinx_html(
    html_text: __.typx.Annotated[
        str,
        __.ddoc.Doc( '''Raw HTML text to preprocess.''' ),
    ],
) -> __.typx.Annotated[
    str,
    __.ddoc.Doc( '''Cleaned HTML text ready for markdown conversion.''' ),
]:
    ''' Strips Sphinx headerlink elements from HTML ahead of conversion.

        Parses the text with the ``lxml`` parser, removes every element
        whose class matches ``headerlink``, and returns the serialized
        document.
    '''
    document = __.bs4.BeautifulSoup( html_text, 'lxml' )
    # Headerlink elements are the ¶ symbols attached to headings.
    for anchor in document.find_all( class_ = 'headerlink' ):
        anchor.decompose( )
    return str( document )