Coverage for sources/librovore/structures/sphinx/conversion.py: 0%

1# vim: set filetype=python fileencoding=utf-8:

2# -*- coding: utf-8 -*-

4#============================================================================#

5# #

6# Licensed under the Apache License, Version 2.0 (the "License"); #

7# you may not use this file except in compliance with the License. #

8# You may obtain a copy of the License at #

9# #

10# http://www.apache.org/licenses/LICENSE-2.0 #

11# #

12# Unless required by applicable law or agreed to in writing, software #

13# distributed under the License is distributed on an "AS IS" BASIS, #

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #

15# See the License for the specific language governing permissions and #

16# limitations under the License. #

17# #

18#============================================================================#

21''' HTML to markdown conversion utilities. '''

24import markdownify as _markdownify

26from bs4 import BeautifulSoup as _BeautifulSoup

def html_to_markdown( html_text: str ) -> str:
    ''' Renders HTML text as trimmed markdown.

        Input that is empty or only whitespace yields an empty string.
        Otherwise the text is passed through the Sphinx preprocessing step
        and then through markdownify with ATX headings. If either stage
        raises, the original HTML text is returned unchanged as a
        best-effort fallback.
    '''
    if html_text.strip( ) == '': return ''
    converter_options = dict(
        heading_style = 'ATX',
        strip = [ 'nav', 'header', 'footer' ],
        escape_underscores = False,
        escape_asterisks = False )
    try:
        # Both stages share one fallback: a failure in preprocessing or in
        # conversion hands the caller back the untouched input.
        sanitized = _preprocess_sphinx_html( html_text )
        rendered = _markdownify.markdownify( sanitized, **converter_options )
    except Exception: return html_text
    return rendered.strip( )

def _preprocess_sphinx_html( html_text: str ) -> str:
    ''' Drops Sphinx headerlink elements ahead of markdown conversion.

        Parses the text with the 'lxml' parser, removes every element
        carrying the ``headerlink`` class (the ¶ symbols), and returns the
        parsed document serialized back to a string.
    '''
    document = _BeautifulSoup( html_text, 'lxml' )
    for headerlink in document.find_all( class_ = 'headerlink' ): headerlink.decompose( )
    return str( document )