Coverage for sources/librovore/structures/mkdocs/extraction.py: 11%
120 statements
« prev ^ index » next coverage.py v7.10.5, created at 2025-08-29 01:14 +0000
« prev ^ index » next coverage.py v7.10.5, created at 2025-08-29 01:14 +0000
1# vim: set filetype=python fileencoding=utf-8:
2# -*- coding: utf-8 -*-
4#============================================================================#
5# #
6# Licensed under the Apache License, Version 2.0 (the "License"); #
7# you may not use this file except in compliance with the License. #
8# You may obtain a copy of the License at #
9# #
10# http://www.apache.org/licenses/LICENSE-2.0 #
11# #
12# Unless required by applicable law or agreed to in writing, software #
13# distributed under the License is distributed on an "AS IS" BASIS, #
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
15# See the License for the specific language governing permissions and #
16# limitations under the License. #
17# #
18#============================================================================#
21''' MkDocs documentation content extraction and processing. '''
24from bs4 import BeautifulSoup as _BeautifulSoup
26from . import __
# Selector tables keyed by theme name. Each table maps a selector role to a
# list of CSS selectors. Consumers in this module that want a single match
# try the selectors in list order, so ordering is significant.
MATERIAL_THEME_PATTERNS: __.cabc.Mapping[
    str, __.cabc.Mapping[ str, __.typx.Any ]
] = __.immut.Dictionary( {
    # Table for the 'material' theme key.
    'material': __.immut.Dictionary( {
        'main_content_selectors': [
            'article[role="main"]', '.md-content__inner',
            '.md-typeset', 'main .md-content' ],
        'api_section_selectors': [
            '.doc.doc-object-member', '.doc.doc-children',
            'section[id]', '.highlight' ],
        'signature_selectors': [
            '.doc-heading', '.highlight .n',
            'h1, h2, h3, h4, h5, h6', 'code' ],
        'description_selectors': [
            '.doc-contents', '.doc-object-member .doc-contents',
            'p', '.admonition' ],
        'cleanup_selectors': [
            '.md-nav', '.md-header', '.md-footer', '.md-sidebar',
            '.headerlink', '.md-clipboard', 'a.md-top' ],
        'code_block_selectors': [
            '.highlight', 'pre code', '.codehilite' ],
    } ),
    # Table for the 'readthedocs' theme key.
    'readthedocs': __.immut.Dictionary( {
        'main_content_selectors': [
            '.wy-nav-content-wrap main', '.document', '[role="main"]' ],
        'api_section_selectors': [
            '.section', 'dl.class', 'dl.function', 'dl.method' ],
        'signature_selectors': [ 'dt', '.descname', '.sig-name' ],
        'description_selectors': [ 'dd', '.field-body', 'p' ],
        'cleanup_selectors': [
            '.headerlink', '.wy-nav-top', '.wy-nav-side' ],
        'code_block_selectors': [ '.highlight', 'pre' ],
    } ),
} )
# Fallback selector table, used by this module when a theme name has no
# entry in MATERIAL_THEME_PATTERNS. Same roles as the per-theme tables.
_GENERIC_PATTERN = __.immut.Dictionary( {
    'main_content_selectors': [
        'article[role="main"]', 'main', '.content', '.document', 'body' ],
    'api_section_selectors': [
        'section[id]', 'div[id]', '.doc-object-member', 'dl' ],
    'signature_selectors': [
        'h1, h2, h3, h4, h5, h6', 'dt', 'code', '.highlight' ],
    'description_selectors': [
        'p', 'dd', '.description', '.doc-contents' ],
    'cleanup_selectors': [
        '.headerlink', 'nav', 'header', 'footer', '.sidebar' ],
    'code_block_selectors': [ '.highlight', 'pre', 'code' ],
} )
async def extract_contents(
    auxdata: __.ApplicationGlobals,
    source: str,
    objects: __.cabc.Sequence[ __.InventoryObject ], /, *,
    theme: __.Absential[ str ] = __.absent,
    include_snippets: bool = True,
) -> list[ __.ContentDocument ]:
    ''' Extracts documentation content for specified objects from MkDocs.

        One extraction coroutine is created per inventory object and all
        of them are gathered together with exceptions captured rather
        than raised. Results rejected by ``__.generics.is_value``, or
        whose value is ``None``, are left out of the returned list.
    '''
    base_url = __.normalize_base_url( source )
    if not objects: return [ ]
    outcomes = await __.asyncf.gather_async(
        *(  _extract_object_documentation(
                auxdata, base_url, entry, include_snippets, theme )
            for entry in objects ),
        return_exceptions = True )
    documents: list[ __.ContentDocument ] = [ ]
    for outcome in outcomes:
        if not __.generics.is_value( outcome ): continue
        if outcome.value is None: continue
        documents.append( outcome.value )
    return documents
def parse_mkdocs_html(
    content: str, element_id: str, url: str, *,
    theme: __.Absential[ str ] = __.absent
) -> __.cabc.Mapping[ str, str ]:
    ''' Parses MkDocs HTML content to extract documentation sections.

        Returns a mapping with ``signature``, ``description`` and
        ``object_name`` entries.

        Raises ``DocumentationParseFailure`` if the HTML parser fails,
        ``DocumentationContentAbsence`` if no main content container is
        found, and ``DocumentationObjectAbsence`` if no target element is
        found.
    '''
    try: document = _BeautifulSoup( content, 'lxml' )
    except Exception as exc:
        raise __.DocumentationParseFailure( element_id, exc ) from exc
    container = _find_main_content_container( document, theme )
    if __.is_absent( container ):
        raise __.DocumentationContentAbsence( element_id )
    located = _find_target_element( container, element_id )
    if not located:
        raise __.DocumentationObjectAbsence( element_id, url )
    signature, description = _extract_content_from_element(
        located, element_id, theme )
    return dict(
        signature = signature,
        description = description,
        object_name = element_id )
def _clean_extracted_text( text: str ) -> str:
    ''' Cleans extracted text while preserving meaningful structure.

        Strips surrounding whitespace, collapses each run of spaces into
        a single space, and reduces any whitespace-only gap between two
        newlines to exactly one blank line.
    '''
    collapsed = __.re.sub( r' +', ' ', text.strip( ) )
    return __.re.sub( r'\n\s*\n', '\n\n', collapsed )
def _cleanup_content(
    content: str,
    cleanup_selectors: __.cabc.Sequence[ str ]
) -> str:
    ''' Removes unwanted elements from content.

        ``content`` is an HTML fragment and ``cleanup_selectors`` are CSS
        selectors. Every element matching any selector is removed and the
        remaining markup is returned. When there is nothing to remove,
        the input string is returned unchanged, so fragments without
        matches are never re-serialized.
    '''
    if not content or not cleanup_selectors: return content
    # 'html.parser' keeps the fragment as given; 'lxml' would wrap it in
    # <html><body> elements which would then leak into the result.
    fragment = _BeautifulSoup( content, 'html.parser' )
    removed = False
    for selector in cleanup_selectors:
        for node in fragment.select( selector ):
            # extract() merely detaches the node, so it stays safe for a
            # match nested inside an element which was already detached.
            node.extract( )
            removed = True
    return str( fragment ) if removed else content
def _convert_to_markdown( html_content: str ) -> str:
    ''' Converts HTML content to markdown format using markdownify.

        Headings are requested in the 'ATX' heading style.
    '''
    # Imported at call time rather than at module import.
    from markdownify import markdownify as _render
    return _render( html_content, heading_style = 'ATX' )
def _derive_documentation_url(
    base_url: __.typx.Any, uri: str, object_name: str
) -> __.typx.Any:
    ''' Derives documentation URL from base URL and object URI.

        The page part of ``uri`` is appended to the base URL path. The
        fragment is chosen as follows:

        * ``uri`` ends with ``#$``: the marker is dropped and the object
          name becomes the fragment (mkdocstrings pattern).
        * ``uri`` contains ``#``: the text after the first ``#`` becomes
          the fragment.
        * otherwise: the object name becomes the fragment.
    '''
    if uri.endswith( '#$' ):
        page, anchor = uri[ :-2 ], object_name
    else:
        page, marker, anchor = uri.partition( '#' )
        if not marker: anchor = object_name
    return base_url._replace(
        path = f"{base_url.path}/{page}", fragment = anchor )
def _extract_content_from_element(
    element: __.typx.Any,
    element_id: str,
    theme: __.Absential[ str ] = __.absent
) -> tuple[ str, str ]:
    ''' Extracts signature and description content from element.

        Selector patterns are looked up by theme name. An absent theme is
        treated as 'material' and a name without a table falls back to
        the generic patterns. The description is passed through content
        cleanup before being returned. ``element_id`` is accepted but not
        used by this function.
    '''
    if __.is_absent( theme ): theme_name = 'material'
    else: theme_name = theme
    patterns = MATERIAL_THEME_PATTERNS.get( theme_name, _GENERIC_PATTERN )
    signature = _extract_signature( element, patterns )
    raw_description = _extract_description( element, patterns )
    selectors = __.typx.cast(
        __.cabc.Sequence[ str ], patterns[ 'cleanup_selectors' ] )
    return signature, _cleanup_content( raw_description, selectors )
def _extract_description(
    element: __.typx.Any,
    patterns: __.cabc.Mapping[ str, __.typx.Any ]
) -> str:
    ''' Extracts description content from element.

        Prefers the inner HTML of the doc-contents container for the
        element. Without such a container, the fragments found via the
        fallback description selectors are joined with blank lines; the
        result is an empty string if there are none.
    '''
    container = _find_doc_contents_container( element )
    if not container:
        fragments = _extract_using_fallback_selectors( element, patterns )
        return '\n\n'.join( fragments )
    return container.decode_contents( )
async def _extract_object_documentation(
    auxdata: __.ApplicationGlobals,
    base_url: __.typx.Any,
    obj: __.InventoryObject,
    include_snippets: bool,
    theme: __.Absential[ str ] = __.absent
) -> __.ContentDocument | None:
    ''' Extracts documentation for a single object from MkDocs site.

        Derives the page URL for the object, retrieves the page through
        the content cache, parses it and converts the description to
        Markdown.

        Returns ``None`` when the page cannot be retrieved or parsed;
        both kinds of failure are reported at debug level. When
        ``include_snippets`` is false, the content snippet is empty;
        otherwise it is the description, truncated to 200 characters with
        a trailing ellipsis if longer.
    '''
    scribe = __.acquire_scribe( __name__ )
    doc_url = _derive_documentation_url( base_url, obj.uri, obj.name )
    # Render the URL once. str() on the URL object is avoided because it
    # is not guaranteed to produce the URL text, whereas geturl() is what
    # the returned document reports.
    location = doc_url.geturl( )
    try:
        html_content = await __.retrieve_url_as_text(
            auxdata.content_cache, doc_url )
    except Exception as exc:
        scribe.debug( "Failed to retrieve %s: %s", location, exc )
        return None
    anchor = doc_url.fragment or str( obj.name )
    try:
        parsed_content = parse_mkdocs_html(
            html_content, anchor, location, theme = theme )
    except Exception as exc:
        # Best effort: an unparseable page only drops this object, but the
        # reason is logged like a retrieval failure rather than vanishing.
        scribe.debug( "Failed to parse %s: %s", location, exc )
        return None
    description = _convert_to_markdown( parsed_content[ 'description' ] )
    snippet_max_length = 200
    if not include_snippets: content_snippet = ''
    elif len( description ) > snippet_max_length:
        content_snippet = description[ : snippet_max_length ] + '...'
    else: content_snippet = description
    return __.ContentDocument(
        inventory_object = obj,
        signature = parsed_content[ 'signature' ],
        description = description,
        content_snippet = content_snippet,
        documentation_url = location,
        extraction_metadata = __.immut.Dictionary( {
            'theme': theme if not __.is_absent( theme ) else 'unknown',
            'extraction_method': 'mkdocs_html_parsing',
            'relevance_score': 1.0,
            'match_reasons': [ 'direct extraction' ],
        } )
    )
def _extract_signature(
    element: __.typx.Any,
    patterns: __.cabc.Mapping[ str, __.typx.Any ]
) -> str:
    ''' Extracts signature/heading content from element.

        The signature selectors are tried in order and the text of the
        first matching descendant is used. If none matches, the text of
        the element itself is used. The text is cleaned before return.
    '''
    selectors = __.typx.cast(
        __.cabc.Sequence[ str ], patterns[ 'signature_selectors' ] )
    # Lazy pipeline: selector lookups stop at the first match.
    matches = ( element.select_one( selector ) for selector in selectors )
    source = next( ( match for match in matches if match ), element )
    return _clean_extracted_text( source.get_text( ) )
def _extract_using_fallback_selectors(
    element: __.typx.Any,
    patterns: __.cabc.Mapping[ str, __.typx.Any ]
) -> list[ str ]:
    ''' Extracts description using fallback selectors.

        Collects the serialized HTML of every descendant matching any of
        the description selectors, in selector order. Elements carrying
        the 'admonition-title' class are skipped and identical markup is
        collected only once.
    '''
    collected: list[ str ] = [ ]
    selectors = __.typx.cast(
        __.cabc.Sequence[ str ], patterns[ 'description_selectors' ] )
    for selector in selectors:
        for candidate in element.select( selector ):
            classes = candidate.get( 'class' )
            if classes and 'admonition-title' in classes: continue
            markup = str( candidate )
            if not markup or markup in collected: continue
            collected.append( markup )
    return collected
def _find_doc_contents_container( element: __.typx.Any ) -> __.typx.Any | None:
    ''' Finds the doc-contents container for the element.

        For a heading element, the following siblings are scanned for the
        first ``div`` carrying the 'doc-contents' class. If the element
        is not a heading, or no such sibling exists, the first descendant
        matching ``.doc-contents`` is returned, or ``None`` if there is
        none.
    '''
    if element.name in ( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ):
        # next_siblings walks until the sibling chain really ends. A
        # truthiness test on each sibling would stop early at an empty
        # text node and miss a container which follows it.
        for sibling in element.next_siblings:
            if not hasattr( sibling, 'get' ): continue  # text node
            if sibling.name != 'div': continue
            if 'doc-contents' in sibling.get( 'class', [ ] ):
                return sibling
    return element.select_one( '.doc-contents' )
def _find_target_element(
    container: __.typx.Any, element_id: str
) -> __.typx.Any:
    ''' Finds target element within main container using ID strategies.

        Strategies, in order: descendant with a matching ``id``;
        descendant with a matching ``data-toc-label`` attribute; first
        heading whose text contains the identifier; first ``section``
        whose space-joined classes contain the identifier. The container
        itself is returned if every strategy fails.
    '''
    direct = (
        container.find( id = element_id )
        or container.find( attrs = { 'data-toc-label': element_id } ) )
    if direct: return direct
    headings = container.find_all( [ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ] )
    heading = next(
        ( item for item in headings if element_id in item.get_text( ) ),
        None )
    if heading is not None: return heading
    for section in container.find_all( 'section' ):
        classes = section.get( 'class' )
        if not classes: continue
        if element_id in ' '.join( classes ): return section
    return container
def _find_main_content_container(
    soup: __.typx.Any, theme: __.Absential[ str ] = __.absent
) -> __.Absential[ __.typx.Any ]:
    ''' Finds main content container using theme-specific strategies.

        The main content selectors of the theme are tried in order and
        the first match is returned. An absent theme is treated as
        'material' and a name without a table falls back to the generic
        patterns. The absence sentinel is returned if nothing matches.
    '''
    theme_name = 'material' if __.is_absent( theme ) else theme
    patterns = MATERIAL_THEME_PATTERNS.get( theme_name, _GENERIC_PATTERN )
    selectors = __.typx.cast(
        __.cabc.Sequence[ str ], patterns[ 'main_content_selectors' ] )
    # map() is lazy, so selector lookups stop at the first match.
    return next(
        ( hit for hit in map( soup.select_one, selectors ) if hit ),
        __.absent )