Coverage for sources/librovore/structures/sphinx/conversion.py: 0%

15 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-03 21:59 +0000

1# vim: set filetype=python fileencoding=utf-8: 

2# -*- coding: utf-8 -*- 

3 

4#============================================================================# 

5# # 

6# Licensed under the Apache License, Version 2.0 (the "License"); # 

7# you may not use this file except in compliance with the License. # 

8# You may obtain a copy of the License at # 

9# # 

10# http://www.apache.org/licenses/LICENSE-2.0 # 

11# # 

12# Unless required by applicable law or agreed to in writing, software # 

13# distributed under the License is distributed on an "AS IS" BASIS, # 

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 

15# See the License for the specific language governing permissions and # 

16# limitations under the License. # 

17# # 

18#============================================================================# 

19 

20 

21''' HTML to markdown conversion utilities. ''' 

22 

23 

24import markdownify as _markdownify 

25 

26from bs4 import BeautifulSoup as _BeautifulSoup 

27 

28 

def html_to_markdown( html_text: str ) -> str:
    ''' Renders HTML text as stripped Markdown.

        Blank or whitespace-only input produces an empty string. If either
        the Sphinx-specific cleanup or the Markdown conversion raises, the
        HTML text is handed back unchanged as a best-effort fallback.
    '''
    if not html_text.strip( ): return ''
    converter_options = dict(
        heading_style = 'ATX',
        strip = [ 'nav', 'header', 'footer' ],
        escape_underscores = False,
        escape_asterisks = False )
    # Cleanup and conversion share the same fallback, so one guard
    # covers both steps.
    try:
        prepared = _preprocess_sphinx_html( html_text )
        converted = _markdownify.markdownify( prepared, **converter_options )
    except Exception: return html_text
    return converted.strip( )

43 

44 

def _preprocess_sphinx_html( html_text: str ) -> str:
    ''' Strips Sphinx headerlink elements from HTML ahead of conversion.

        Parses the text with the ``lxml`` parser, removes every element
        carrying the ``headerlink`` class (the ¶ symbols), and returns
        the serialized result.
    '''
    document = _BeautifulSoup( html_text, 'lxml' )
    headerlinks = document.find_all( class_ = 'headerlink' )
    for headerlink in headerlinks: headerlink.decompose( )
    return str( document )