Coverage for sources/librovore/structures/mkdocs/extraction.py: 11%
104 statements
« prev ^ index » next — coverage.py v7.10.6, created at 2025-09-06 02:25 +0000
1# vim: set filetype=python fileencoding=utf-8:
2# -*- coding: utf-8 -*-
4#============================================================================#
5# #
6# Licensed under the Apache License, Version 2.0 (the "License"); #
7# you may not use this file except in compliance with the License. #
8# You may obtain a copy of the License at #
9# #
10# http://www.apache.org/licenses/LICENSE-2.0 #
11# #
12# Unless required by applicable law or agreed to in writing, software #
13# distributed under the License is distributed on an "AS IS" BASIS, #
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
15# See the License for the specific language governing permissions and #
16# limitations under the License. #
17# #
18#============================================================================#
21''' MkDocs documentation content extraction and processing. '''
24from bs4 import BeautifulSoup as _BeautifulSoup
26from . import __
# CSS selector patterns for known MkDocs themes, keyed by theme name.
# Each theme maps selector-category names to ordered lists of CSS selectors;
# consumers try selectors in list order and use the first match.
# Lookup falls back to _GENERIC_PATTERN for unrecognized theme names.
MATERIAL_THEME_PATTERNS: __.cabc.Mapping[
    str, __.cabc.Mapping[ str, __.typx.Any ]
] = __.immut.Dictionary( {
    'material': __.immut.Dictionary( {
        # Containers likely to hold the page's primary documentation body.
        'main_content_selectors': [
            'article[role="main"]',
            '.md-content__inner',
            '.md-typeset',
            'main .md-content',
        ],
        # Sections produced by mkdocstrings / API renderers.
        'api_section_selectors': [
            '.doc.doc-object-member',
            '.doc.doc-children',
            'section[id]',
            '.highlight',
        ],
        # Elements containing descriptive prose for an object.
        'description_selectors': [
            '.doc-contents',
            '.doc-object-member .doc-contents',
            'p',
            '.admonition',
        ],
        # Navigation/chrome elements to strip from extracted content.
        'cleanup_selectors': [
            '.md-nav',
            '.md-header',
            '.md-footer',
            '.md-sidebar',
            '.headerlink',
            '.md-clipboard',
            'a.md-top',
        ],
        # Containers for highlighted code samples.
        'code_block_selectors': [
            '.highlight',
            'pre code',
            '.codehilite',
        ],
    } ),
    'readthedocs': __.immut.Dictionary( {
        'main_content_selectors': [
            '.wy-nav-content-wrap main',
            '.document',
            '[role="main"]',
        ],
        'api_section_selectors': [
            '.section',
            'dl.class',
            'dl.function',
            'dl.method',
        ],
        'description_selectors': [
            'dd',
            '.field-body',
            'p',
        ],
        'cleanup_selectors': [
            '.headerlink',
            '.wy-nav-top',
            '.wy-nav-side',
        ],
        'code_block_selectors': [
            '.highlight',
            'pre',
        ],
    } ),
} )
# Theme-agnostic fallback selectors, used when a theme name is unknown
# or absent from MATERIAL_THEME_PATTERNS. Categories mirror the per-theme
# entries above; selectors are tried in list order, first match wins.
_GENERIC_PATTERN = __.immut.Dictionary( {
    'main_content_selectors': [
        'article[role="main"]',
        'main',
        '.content',
        '.document',
        'body',  # last resort: whole page body
    ],
    'api_section_selectors': [
        'section[id]',
        'div[id]',
        '.doc-object-member',
        'dl',
    ],
    'description_selectors': [
        'p',
        'dd',
        '.description',
        '.doc-contents',
    ],
    'cleanup_selectors': [
        '.headerlink',
        'nav',
        'header',
        'footer',
        '.sidebar',
    ],
    'code_block_selectors': [
        '.highlight',
        'pre',
        'code',
    ],
} )
async def extract_contents(
    auxdata: __.ApplicationGlobals,
    source: str,
    objects: __.cabc.Sequence[ __.InventoryObject ], /, *,
    theme: __.Absential[ str ] = __.absent,
) -> list[ __.ContentDocument ]:
    ''' Extracts documentation content for specified objects from MkDocs.

        Retrieves and parses each object's documentation page concurrently.
        Failed extractions are dropped rather than propagated, so the
        result may contain fewer documents than objects requested.
    '''
    # Guard clause first: avoids URL normalization when there is no work.
    if not objects: return [ ]
    base_url = __.normalize_base_url( source )
    tasks = [
        _extract_object_documentation(
            auxdata, base_url, source, obj, theme )
        for obj in objects ]
    candidate_results = await __.asyncf.gather_async(
        *tasks, return_exceptions = True )
    # Keep only successful results which actually produced a document.
    results: list[ __.ContentDocument ] = [
        result.value for result in candidate_results
        if __.generics.is_value( result ) and result.value is not None ]
    return results
def parse_mkdocs_html(
    content: str, element_id: str, url: str, *,
    theme: __.Absential[ str ] = __.absent
) -> __.cabc.Mapping[ str, str ]:
    ''' Parses MkDocs HTML content to extract documentation sections.

        Raises parse, content-absence, or object-absence failures when
        the page cannot be parsed, lacks a main container, or lacks the
        requested element, respectively.
    '''
    try: document = _BeautifulSoup( content, 'lxml' )
    except Exception as exc:
        raise __.DocumentationParseFailure( element_id, exc ) from exc
    container = _find_main_content_container( document, theme )
    if __.is_absent( container ):
        raise __.DocumentationContentAbsence( element_id )
    element = _find_target_element( container, element_id )
    if not element: raise __.DocumentationObjectAbsence( element_id, url )
    return {
        'description': _extract_content_from_element(
            element, element_id, theme ),
        'object_name': element_id,
    }
def _cleanup_content(
    content: str,
    cleanup_selectors: __.cabc.Sequence[ str ]
) -> str:
    ''' Removes unwanted elements from content.

        Currently a pass-through: ``cleanup_selectors`` is accepted for
        interface stability but not yet applied to ``content``.
    '''
    # TODO: Implement more sophisticated cleanup
    return content
def _convert_to_markdown( html_content: str ) -> str:
    ''' Converts HTML content to markdown format using markdownify. '''
    # Deferred import: markdownify is only needed on this code path.
    from markdownify import markdownify as htmlconvert
    return htmlconvert( html_content, heading_style = 'ATX' )
def _derive_documentation_url(
    base_url: __.typx.Any, uri: str, object_name: str
) -> __.typx.Any:
    ''' Derives documentation URL from base URL and object URI. '''
    if uri.endswith( '#$' ):
        # mkdocstrings pattern - replace #$ with object name anchor
        return base_url._replace(
            path = f"{base_url.path}/{uri[ : -2 ]}",
            fragment = object_name )
    prefix, separator, anchor = uri.partition( '#' )
    if separator:
        # Explicit anchor present: keep it as the fragment.
        return base_url._replace(
            path = f"{base_url.path}/{prefix}", fragment = anchor )
    # No anchor: point the fragment at the object itself.
    return base_url._replace(
        path = f"{base_url.path}/{uri}", fragment = object_name )
def _extract_content_from_element(
    element: __.typx.Any,
    element_id: str,
    theme: __.Absential[ str ] = __.absent
) -> str:
    ''' Extracts description content from element. '''
    if __.is_absent( theme ): theme_name = 'material'
    else: theme_name = theme
    patterns = MATERIAL_THEME_PATTERNS.get( theme_name, _GENERIC_PATTERN )
    selectors = __.typx.cast(
        __.cabc.Sequence[ str ], patterns[ 'cleanup_selectors' ] )
    return _cleanup_content(
        _extract_description( element, patterns ), selectors )
def _extract_description(
    element: __.typx.Any,
    patterns: __.cabc.Mapping[ str, __.typx.Any ]
) -> str:
    ''' Extracts description content from element. '''
    # Prefer a dedicated doc-contents container when one exists.
    container = _find_doc_contents_container( element )
    if container: return container.decode_contents( )
    fragments = _extract_using_fallback_selectors( element, patterns )
    if not fragments: return ''
    return '\n\n'.join( fragments )
async def _extract_object_documentation(
    auxdata: __.ApplicationGlobals,
    base_url: __.typx.Any,
    location: str,
    obj: __.InventoryObject,
    theme: __.Absential[ str ] = __.absent,
) -> __.ContentDocument | None:
    ''' Extracts documentation for a single object from MkDocs site.

        Returns ``None`` on retrieval or parse failure so that the
        concurrent batch extraction in ``extract_contents`` can simply
        drop failed objects instead of aborting the whole batch.
    '''
    doc_url = _derive_documentation_url(
        base_url, obj.uri, obj.name )
    try:
        html_content = (
            await __.retrieve_url_as_text(
                auxdata.content_cache, doc_url ) )
    except Exception as exc:
        # Best-effort: log at debug level and skip this object.
        __.acquire_scribe( __name__ ).debug(
            "Failed to retrieve %s: %s", doc_url, exc )
        return None
    # Prefer the URL fragment as the anchor; fall back to the object name.
    anchor = doc_url.fragment or str( obj.name )
    try:
        parsed_content = parse_mkdocs_html(
            html_content, anchor, str( doc_url ), theme = theme )
    except Exception: return None  # unparseable page: skip silently
    description = _convert_to_markdown( parsed_content[ 'description' ] )
    content_id = __.produce_content_id( location, obj.name )
    return __.ContentDocument(
        inventory_object = obj,
        content_id = content_id,
        description = description,
        documentation_url = doc_url.geturl( ),
        extraction_metadata = __.immut.Dictionary( {
            'theme': theme if not __.is_absent( theme ) else 'unknown',
            'extraction_method': 'mkdocs_html_parsing',
            'relevance_score': 1.0,
            'match_reasons': [ 'direct extraction' ],
        } )
    )
def _extract_using_fallback_selectors(
    element: __.typx.Any,
    patterns: __.cabc.Mapping[ str, __.typx.Any ]
) -> list[ str ]:
    ''' Extracts description using fallback selectors. '''
    collected: list[ str ] = [ ]
    selectors = __.typx.cast(
        __.cabc.Sequence[ str ], patterns[ 'description_selectors' ] )
    for selector in selectors:
        for candidate in element.select( selector ):
            classes = candidate.get( 'class' )
            # Skip admonition title bars; they carry no description text.
            if classes and 'admonition-title' in classes: continue
            markup = str( candidate )
            # Deduplicate while preserving first-seen order.
            if markup and markup not in collected:
                collected.append( markup )
    return collected
def _find_doc_contents_container( element: __.typx.Any ) -> __.typx.Any | None:
    ''' Finds the doc-contents container for the element. '''
    if element.name in ( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ):
        # mkdocstrings places the body in a sibling div after the heading.
        # NOTE: truthiness test (not `is not None`) intentionally stops on
        # falsy siblings, matching prior behavior.
        node = element.next_sibling
        while node:
            is_contents_div = (
                hasattr( node, 'get' ) and node.name == 'div'
                and 'doc-contents' in node.get( 'class', [ ] ) )
            if is_contents_div: return node
            node = node.next_sibling
    return element.select_one( '.doc-contents' )
def _find_target_element(
    container: __.typx.Any, element_id: str
) -> __.typx.Any:
    ''' Finds target element within main container using ID strategies. '''
    # Strategy 1: exact id attribute match.
    found = container.find( id = element_id )
    if found: return found
    # Strategy 2: mkdocstrings data-toc-label attribute.
    found = container.find( attrs = { 'data-toc-label': element_id } )
    if found: return found
    # Strategy 3: heading whose text mentions the identifier.
    headings = container.find_all( [ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ] )
    for heading in headings:
        if element_id in heading.get_text( ): return heading
    # Strategy 4: section whose class list contains the identifier.
    for section in container.find_all( 'section' ):
        classes = section.get( 'class' )
        if classes and element_id in ' '.join( classes ): return section
    # Fallback: the whole container.
    return container
def _find_main_content_container(
    soup: __.typx.Any, theme: __.Absential[ str ] = __.absent
) -> __.Absential[ __.typx.Any ]:
    ''' Finds main content container using theme-specific strategies. '''
    if __.is_absent( theme ): theme_name = 'material'
    else: theme_name = theme
    patterns = MATERIAL_THEME_PATTERNS.get( theme_name, _GENERIC_PATTERN )
    selectors = __.typx.cast(
        __.cabc.Sequence[ str ], patterns[ 'main_content_selectors' ] )
    # First selector with a match wins; absent when none match.
    candidates = ( soup.select_one( selector ) for selector in selectors )
    return next(
        ( candidate for candidate in candidates if candidate ), __.absent )