Coverage for sources/librovore/structures/mkdocs/extraction.py: 11% (94 statements)
coverage.py v7.10.7, created at 2025-09-28 22:09 +0000
# vim: set filetype=python fileencoding=utf-8:
# -*- coding: utf-8 -*-

#============================================================================#
#                                                                            #
#  Licensed under the Apache License, Version 2.0 (the "License");           #
#  you may not use this file except in compliance with the License.          #
#  You may obtain a copy of the License at                                   #
#                                                                            #
#      http://www.apache.org/licenses/LICENSE-2.0                            #
#                                                                            #
#  Unless required by applicable law or agreed to in writing, software       #
#  distributed under the License is distributed on an "AS IS" BASIS,         #
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  #
#  See the License for the specific language governing permissions and       #
#  limitations under the License.                                            #
#                                                                            #
#============================================================================#


''' MkDocs documentation content extraction and processing. '''


from bs4 import BeautifulSoup as _BeautifulSoup

from . import __
from .patterns import THEME_PATTERNS as _THEME_PATTERNS
from .patterns import UNIVERSAL_PATTERNS as _UNIVERSAL_PATTERNS


async def extract_contents(
    auxdata: __.ApplicationGlobals,
    source: str,
    objects: __.cabc.Sequence[ __.InventoryObject ], /, *,
    theme: __.Absential[ str ] = __.absent,
) -> list[ __.ContentDocument ]:
    ''' Extracts documentation content for specified objects from MkDocs. '''
    base_url = __.normalize_base_url( source )
    if not objects: return [ ]
    tasks = [
        _extract_object_documentation(
            auxdata, base_url, source, obj, theme )
        for obj in objects ]
    candidate_results = await __.asyncf.gather_async(
        *tasks, return_exceptions = True )
    # Keep successful extractions only; failed or empty results are dropped.
    results: list[ __.ContentDocument ] = [
        result.value for result in candidate_results
        if __.generics.is_value( result ) and result.value is not None ]
    return results
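
# Hypothetical usage sketch (the site URL and theme name are illustrative,
# not part of this module):
#
#     documents = await extract_contents(
#         auxdata, 'https://example.org/docs', objects, theme = 'material' )
#
# Since failed retrievals and parses yield None, the result may contain
# fewer documents than the number of objects requested.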


def parse_mkdocs_html(
    content: str, element_id: str, url: str, *,
    theme: __.Absential[ str ] = __.absent
) -> __.cabc.Mapping[ str, str ]:
    ''' Parses MkDocs HTML content to extract documentation sections. '''
    try: soup = _BeautifulSoup( content, 'lxml' )
    except Exception as exc:
        raise __.DocumentationParseFailure( element_id, exc ) from exc
    main_container = _find_main_content_container( soup, theme )
    if __.is_absent( main_container ):
        raise __.DocumentationContentAbsence( element_id )
    target_element = _find_target_element( main_container, element_id )
    if not target_element:
        raise __.DocumentationObjectAbsence( element_id, url )
    description = _extract_content_from_element(
        target_element, element_id, theme )
    return {
        'description': description,
        'object_name': element_id,
    }
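
# Illustrative result shape (values hypothetical): a successful parse for
# element_id 'pkg.mod.func' returns a mapping such as
# { 'description': '<div class="doc-contents">...</div>',
#   'object_name': 'pkg.mod.func' }.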


def _cleanup_content(
    content: str,
    cleanup_selectors: __.cabc.Sequence[ str ]
) -> str:
    ''' Removes unwanted elements from content.

        Currently a pass-through; selector-based cleanup is not yet wired in.
    '''
    # TODO: Implement more sophisticated cleanup
    return content
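
# A minimal sketch of what the selector-based cleanup might become (an
# assumption, not the committed implementation; 'html.parser' is used here
# because it does not wrap fragments in <html>/<body> the way lxml does):
#
#     soup = _BeautifulSoup( content, 'html.parser' )
#     for selector in cleanup_selectors:
#         for node in soup.select( selector ):
#             node.decompose( )
#     return str( soup )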


def _convert_to_markdown( html_content: str ) -> str:
    ''' Converts HTML content to markdown format using markdownify. '''
    import markdownify
    return markdownify.markdownify( html_content, heading_style = 'ATX' )
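
# For example, markdownify.markdownify( '<h2>Usage</h2>', heading_style = 'ATX' )
# yields an ATX heading ('## Usage', plus trailing newlines) rather than the
# underlined Setext default.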


def _derive_documentation_url(
    base_url: __.typx.Any, uri: str, object_name: str
) -> __.typx.Any:
    ''' Derives documentation URL from base URL and object URI. '''
    if uri.endswith( '#$' ):
        # mkdocstrings pattern - replace #$ with object name anchor
        clean_uri = uri[ :-2 ]
        new_path = f"{base_url.path}/{clean_uri}"
        return base_url._replace( path = new_path, fragment = object_name )
    if '#' in uri:
        path_part, fragment = uri.split( '#', 1 )
        new_path = f"{base_url.path}/{path_part}"
        return base_url._replace( path = new_path, fragment = fragment )
    new_path = f"{base_url.path}/{uri}"
    return base_url._replace( path = new_path, fragment = object_name )
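
# Worked example (hypothetical values): given a base URL whose path is
# '/docs' and the mkdocstrings URI 'reference/module/#$' for object
# 'pkg.mod.func', the '#$' suffix is stripped and the object name becomes
# the fragment, yielding .../docs/reference/module/#pkg.mod.func.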


def _extract_content_from_element(
    element: __.typx.Any,
    element_id: str,
    theme: __.Absential[ str ] = __.absent
) -> str:
    ''' Extracts description content using universal patterns. '''
    description = _extract_description( element )
    cleanup_selectors = _UNIVERSAL_PATTERNS[ 'navigation_cleanup' ][
        'universal_selectors' ]
    return _cleanup_content( description, cleanup_selectors )


def _extract_description( element: __.typx.Any ) -> str:
    ''' Extracts description content from element. '''
    doc_contents = _find_doc_contents_container( element )
    if doc_contents:
        return doc_contents.decode_contents( )
    return ''


async def _extract_object_documentation(
    auxdata: __.ApplicationGlobals,
    base_url: __.typx.Any,
    location: str,
    obj: __.InventoryObject,
    theme: __.Absential[ str ] = __.absent,
) -> __.ContentDocument | None:
    ''' Extracts documentation for a single object from MkDocs site. '''
    doc_url = _derive_documentation_url( base_url, obj.uri, obj.name )
    try:
        html_content = (
            await __.retrieve_url_as_text(
                auxdata.content_cache, doc_url ) )
    except Exception as exc:
        __.acquire_scribe( __name__ ).debug(
            "Failed to retrieve %s: %s", doc_url, exc )
        return None
    anchor = doc_url.fragment or str( obj.name )
    # Parse failures are swallowed: the object simply yields no document.
    try:
        parsed_content = parse_mkdocs_html(
            html_content, anchor, str( doc_url ), theme = theme )
    except Exception: return None
    description = _convert_to_markdown( parsed_content[ 'description' ] )
    content_id = __.produce_content_id( location, obj.name )
    return __.ContentDocument(
        inventory_object = obj,
        content_id = content_id,
        description = description,
        documentation_url = doc_url.geturl( ),
        extraction_metadata = __.immut.Dictionary( {
            'theme': theme if not __.is_absent( theme ) else 'unknown',
            'extraction_method': 'mkdocs_html_parsing',
            'relevance_score': 1.0,
            'match_reasons': [ 'direct extraction' ],
        } )
    )


def _find_doc_contents_container( element: __.typx.Any ) -> __.typx.Any | None:
    ''' Finds the doc-contents container for the element. '''
    if element.name in ( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ):
        sibling = element.next_sibling
        while sibling:
            if (
                hasattr( sibling, 'get' ) and sibling.name == 'div' and
                'doc-contents' in sibling.get( 'class', [ ] )
            ): return sibling
            sibling = sibling.next_sibling
    return element.select_one( '.doc-contents' )
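
# mkdocstrings typically renders an object as a heading followed by a
# sibling <div class="doc-contents">...</div>; the sibling walk above
# targets that shape, with a descendant search as the general fallback.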


def _find_target_element(
    container: __.typx.Any, element_id: str
) -> __.typx.Any:
    ''' Finds target element within main container using ID strategies. '''
    # Strategy order: exact id, TOC label, heading text, section class,
    # then the whole container as a last resort.
    target = container.find( id = element_id )
    if target: return target
    target = container.find( attrs = { 'data-toc-label': element_id } )
    if target: return target
    for heading in container.find_all(
        [ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ] ):
        if element_id in heading.get_text( ):
            return heading
    for section in container.find_all( 'section' ):
        class_attr = section.get( 'class' )
        if class_attr and element_id in ' '.join( class_attr ):
            return section
    return container


def _find_main_content_container(
    soup: __.typx.Any, theme: __.Absential[ str ] = __.absent
) -> __.Absential[ __.typx.Any ]:
    ''' Finds main content container, trying theme-specific patterns first. '''
    if (
        not __.is_absent( theme )
        and theme in _THEME_PATTERNS[ 'content_containers' ]
    ):
        theme_selectors = _THEME_PATTERNS[ 'content_containers' ][ theme ]
        for selector in theme_selectors:
            container = soup.select_one( selector )
            if container: return container
    content_config = _UNIVERSAL_PATTERNS[ 'content_containers' ]
    universal_selectors = content_config[ 'universal_selectors' ]
    for selector in universal_selectors:
        container = soup.select_one( selector )
        if container: return container
    return __.absent
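
# The pattern tables consumed above are assumed to be shaped roughly like
# the following (illustrative only; the actual selectors live in .patterns):
#
#     _THEME_PATTERNS = {
#         'content_containers': {
#             'material': ( 'main.md-main .md-content', ),
#         },
#     }
#     _UNIVERSAL_PATTERNS = {
#         'content_containers': {
#             'universal_selectors': ( 'main', 'article' ) },
#         'navigation_cleanup': {
#             'universal_selectors': ( 'nav', '.toc' ) },
#     }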