Coverage for sources/librovore/structures/sphinx/conversion.py: 0%
15 statements
« prev ^ index » next coverage.py v7.10.5, created at 2025-08-29 01:14 +0000
« prev ^ index » next coverage.py v7.10.5, created at 2025-08-29 01:14 +0000
1# vim: set filetype=python fileencoding=utf-8:
2# -*- coding: utf-8 -*-
4#============================================================================#
5# #
6# Licensed under the Apache License, Version 2.0 (the "License"); #
7# you may not use this file except in compliance with the License. #
8# You may obtain a copy of the License at #
9# #
10# http://www.apache.org/licenses/LICENSE-2.0 #
11# #
12# Unless required by applicable law or agreed to in writing, software #
13# distributed under the License is distributed on an "AS IS" BASIS, #
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
15# See the License for the specific language governing permissions and #
16# limitations under the License. #
17# #
18#============================================================================#
21''' HTML to markdown conversion utilities. '''
24import markdownify as _markdownify
26from bs4 import BeautifulSoup as _BeautifulSoup
def html_to_markdown( html_text: str ) -> str:
    ''' Renders HTML text as markdown, best-effort.

        Blank or whitespace-only input yields an empty string. When either
        the Sphinx preprocessing pass or the markdownify conversion raises,
        the input text is handed back unchanged instead of propagating the
        error.
    '''
    if not html_text.strip( ):
        return ''
    # ATX headings ('#'-prefixed); nav/header/footer tags are passed to
    # markdownify's 'strip' option; underscores and asterisks are left
    # unescaped.
    options = dict(
        heading_style = 'ATX',
        strip = [ 'nav', 'header', 'footer' ],
        escape_underscores = False,
        escape_asterisks = False )
    try:
        prepared = _preprocess_sphinx_html( html_text )
        rendered = _markdownify.markdownify( prepared, **options )
    except Exception:
        # Either stage failing falls back to the raw input.
        return html_text
    return rendered.strip( )
def _preprocess_sphinx_html( html_text: str ) -> str:
    ''' Strips Sphinx-specific markup ahead of markdownify conversion.

        Parses the text with the 'lxml' parser, drops every element whose
        class is 'headerlink', and returns the re-serialized document.
    '''
    document = _BeautifulSoup( html_text, 'lxml' )
    # Headerlink anchors are the permalink markers (¶ symbols) next to
    # headings; they would otherwise leak into the markdown output.
    headerlinks = document.find_all( class_ = 'headerlink' )
    for anchor in headerlinks:
        anchor.decompose( )
    return str( document )