Coverage for sources/librovore/structures/mkdocs/extraction.py: 10%
130 statements
# vim: set filetype=python fileencoding=utf-8:
# -*- coding: utf-8 -*-

#============================================================================#
#                                                                            #
# Licensed under the Apache License, Version 2.0 (the "License");           #
# you may not use this file except in compliance with the License.          #
# You may obtain a copy of the License at                                    #
#                                                                            #
#     http://www.apache.org/licenses/LICENSE-2.0                             #
#                                                                            #
# Unless required by applicable law or agreed to in writing, software       #
# distributed under the License is distributed on an "AS IS" BASIS,         #
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  #
# See the License for the specific language governing permissions and       #
# limitations under the License.                                             #
#                                                                            #
#============================================================================#


''' MkDocs documentation content extraction and processing. '''


from bs4 import BeautifulSoup as _BeautifulSoup

from . import __


MATERIAL_THEME_PATTERNS: __.cabc.Mapping[
    str, __.cabc.Mapping[ str, __.typx.Any ]
] = __.immut.Dictionary( {
    'material': __.immut.Dictionary( {
        'main_content_selectors': [
            'article[role="main"]',
            '.md-content__inner',
            '.md-typeset',
            'main .md-content',
        ],
        'api_section_selectors': [
            '.doc.doc-object-member',
            '.doc.doc-children',
            'section[id]',
            '.highlight',
        ],
        'signature_selectors': [
            '.doc-heading',
            '.highlight .n',
            'h1, h2, h3, h4, h5, h6',
            'code',
        ],
        'description_selectors': [
            '.doc-contents',
            '.doc-object-member .doc-contents',
            'p',
            '.admonition',
        ],
        'cleanup_selectors': [
            '.md-nav',
            '.md-header',
            '.md-footer',
            '.md-sidebar',
            '.headerlink',
            '.md-clipboard',
            'a.md-top',
        ],
        'code_block_selectors': [
            '.highlight',
            'pre code',
            '.codehilite',
        ],
    } ),
    'readthedocs': __.immut.Dictionary( {
        'main_content_selectors': [
            '.wy-nav-content-wrap main',
            '.document',
            '[role="main"]',
        ],
        'api_section_selectors': [
            '.section',
            'dl.class',
            'dl.function',
            'dl.method',
        ],
        'signature_selectors': [
            'dt',
            '.descname',
            '.sig-name',
        ],
        'description_selectors': [
            'dd',
            '.field-body',
            'p',
        ],
        'cleanup_selectors': [
            '.headerlink',
            '.wy-nav-top',
            '.wy-nav-side',
        ],
        'code_block_selectors': [
            '.highlight',
            'pre',
        ],
    } ),
} )


_GENERIC_PATTERN = __.immut.Dictionary( {
    'main_content_selectors': [
        'article[role="main"]',
        'main',
        '.content',
        '.document',
        'body',
    ],
    'api_section_selectors': [
        'section[id]',
        'div[id]',
        '.doc-object-member',
        'dl',
    ],
    'signature_selectors': [
        'h1, h2, h3, h4, h5, h6',
        'dt',
        'code',
        '.highlight',
    ],
    'description_selectors': [
        'p',
        'dd',
        '.description',
        '.doc-contents',
    ],
    'cleanup_selectors': [
        '.headerlink',
        'nav',
        'header',
        'footer',
        '.sidebar',
    ],
    'code_block_selectors': [
        '.highlight',
        'pre',
        'code',
    ],
} )
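
# Selector patterns above are keyed by theme name; unknown themes fall back to
# the generic pattern.  Illustrative lookup (not executed; 'mkdocs-classic' is
# a hypothetical theme name):
#     patterns = MATERIAL_THEME_PATTERNS.get(
#         'mkdocs-classic', _GENERIC_PATTERN )
#     patterns[ 'main_content_selectors' ][ 0 ]  # 'article[role="main"]'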


async def extract_contents(
    auxdata: __.ApplicationGlobals,
    source: str,
    objects: __.cabc.Sequence[ __.cabc.Mapping[ str, __.typx.Any ] ], /, *,
    theme: __.Absential[ str ] = __.absent,
    include_snippets: bool = True,
) -> list[ dict[ str, __.typx.Any ] ]:
    ''' Extracts documentation content for specified objects from MkDocs. '''
    base_url = __.normalize_base_url( source )
    if not objects: return [ ]
    tasks = [
        _extract_object_documentation(
            auxdata, base_url, dict( obj ), include_snippets, theme )
        for obj in objects ]
    candidate_results = await __.asyncf.gather_async(
        *tasks, return_exceptions = True )
    results: list[ dict[ str, __.typx.Any ] ] = [
        dict( result.value ) for result in candidate_results
        if __.generics.is_value( result ) and result.value is not None ]
    return results
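
# Illustrative usage (hypothetical site URL and object record; object mappings
# are expected to carry 'name', 'uri', 'domain', 'role', and 'priority' keys,
# as consumed by _extract_object_documentation below):
#     contents = await extract_contents(
#         auxdata, 'https://example.org/docs', [ {
#             'name': 'mypackage.Widget', 'uri': 'reference/widget/#$',
#             'domain': 'py', 'role': 'class', 'priority': 1,
#         } ], theme = 'material' )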


def parse_mkdocs_html(
    content: str, element_id: str, url: str, *,
    theme: __.Absential[ str ] = __.absent
) -> __.cabc.Mapping[ str, str ]:
    ''' Parses MkDocs HTML content to extract documentation sections. '''
    try: soup = _BeautifulSoup( content, 'lxml' )
    except Exception as exc:
        raise __.DocumentationParseFailure( element_id, exc ) from exc
    main_container = _find_main_content_container( soup, theme )
    if __.is_absent( main_container ):
        raise __.DocumentationContentAbsence( element_id )
    target_element = _find_target_element( main_container, element_id )
    if not target_element:
        raise __.DocumentationObjectAbsence( element_id, url )
    signature, description = _extract_content_from_element(
        target_element, element_id, theme )
    return {
        'signature': signature,
        'description': description,
        'object_name': element_id,
    }
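
# On success, the returned mapping carries the extracted pieces; the values
# shown here are hypothetical, with the description left as raw HTML:
#     { 'signature': 'class Widget', 'description': '<p>A widget.</p>',
#       'object_name': 'mypackage.Widget' }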


def _clean_extracted_text( text: str ) -> str:
    ''' Cleans extracted text while preserving meaningful structure. '''
    text = text.strip( )
    text = __.re.sub( r' +', ' ', text )
    return __.re.sub( r'\n\s*\n', '\n\n', text )


def _cleanup_content(
    content: str,
    cleanup_selectors: __.cabc.Sequence[ str ]
) -> str:
    ''' Removes unwanted elements from content. '''
    # TODO: Implement more sophisticated cleanup
    return content


def _convert_to_markdown( html_content: str ) -> str:
    ''' Converts HTML content to markdown format using markdownify. '''
    import markdownify
    return markdownify.markdownify( html_content, heading_style = 'ATX' )


def _derive_documentation_url(
    base_url: __.typx.Any, uri: str, object_name: str
) -> __.typx.Any:
    ''' Derives documentation URL from base URL and object URI. '''
    if uri.endswith( '#$' ):
        # mkdocstrings pattern - replace #$ with object name anchor
        clean_uri = uri[ : -2 ]
        new_path = f"{base_url.path}/{clean_uri}"
        return base_url._replace( path = new_path, fragment = object_name )
    if '#' in uri:
        path_part, fragment = uri.split( '#', 1 )
        new_path = f"{base_url.path}/{path_part}"
        return base_url._replace( path = new_path, fragment = fragment )
    new_path = f"{base_url.path}/{uri}"
    return base_url._replace( path = new_path, fragment = object_name )
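
# Illustrative derivations (hypothetical base URL with path '/docs'; the base
# URL is assumed to be a urllib.parse result supporting _replace):
#     'reference/widget/#$'       -> path '/docs/reference/widget/',
#                                    fragment set to the object name
#     'reference/widget/#Widget'  -> path '/docs/reference/widget/',
#                                    fragment 'Widget'
#     'reference/widget/'         -> path '/docs/reference/widget/',
#                                    fragment set to the object name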


def _extract_content_from_element(
    element: __.typx.Any,
    element_id: str,
    theme: __.Absential[ str ] = __.absent
) -> tuple[ str, str ]:
    ''' Extracts signature and description content from element. '''
    theme_name = theme if not __.is_absent( theme ) else 'material'
    patterns = MATERIAL_THEME_PATTERNS.get( theme_name, _GENERIC_PATTERN )
    signature = _extract_signature( element, patterns )
    description = _extract_description( element, patterns )
    cleanup_selectors = __.typx.cast(
        __.cabc.Sequence[ str ], patterns[ 'cleanup_selectors' ] )
    description = _cleanup_content( description, cleanup_selectors )
    return signature, description


def _extract_description(
    element: __.typx.Any,
    patterns: __.cabc.Mapping[ str, __.typx.Any ]
) -> str:
    ''' Extracts description content from element. '''
    doc_contents = _find_doc_contents_container( element )
    if doc_contents:
        return doc_contents.decode_contents( )
    descriptions = _extract_using_fallback_selectors( element, patterns )
    return '\n\n'.join( descriptions ) if descriptions else ''


async def _extract_object_documentation(
    auxdata: __.ApplicationGlobals,
    base_url: __.typx.Any,
    obj: dict[ str, __.typx.Any ],
    include_snippets: bool,
    theme: __.Absential[ str ] = __.absent
) -> dict[ str, __.typx.Any ] | None:
    ''' Extracts documentation for a single object from MkDocs site. '''
    doc_url = _derive_documentation_url(
        base_url, obj[ 'uri' ], obj[ 'name' ] )
    try:
        html_content = (
            await __.retrieve_url_as_text(
                auxdata.content_cache, doc_url ) )
    except Exception as exc:
        __.acquire_scribe( __name__ ).debug(
            "Failed to retrieve %s: %s", doc_url, exc )
        return None
    anchor = doc_url.fragment or str( obj[ 'name' ] )
    try:
        parsed_content = parse_mkdocs_html(
            html_content, anchor, str( doc_url ), theme = theme )
    except Exception: return None
    description = _convert_to_markdown( parsed_content[ 'description' ] )
    snippet_max_length = 200
    if include_snippets:
        content_snippet = (
            description[ : snippet_max_length ] + '...'
            if len( description ) > snippet_max_length
            else description )
    else: content_snippet = ''
    return {
        'object_name': obj[ 'name' ],
        'object_type': obj[ 'role' ],
        'domain': obj[ 'domain' ],
        'priority': obj[ 'priority' ],
        'url': doc_url.geturl( ),
        'signature': parsed_content[ 'signature' ],
        'description': description,
        'content_snippet': content_snippet,
        'relevance_score': 1.0,
        'match_reasons': [ 'direct extraction' ],
    }
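
# Retrieval or parse failures yield None, which extract_contents filters out
# of its gathered results; successful extractions return records in the shape
# built above.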


def _extract_paragraphs_from_doc_contents(
    doc_contents: __.typx.Any
) -> list[ str ]:
    ''' Legacy function - now unused after markdownify migration. '''
    # This function is kept for backward compatibility but is no longer used
    # since we now extract the full doc-contents HTML in _extract_description
    descriptions: list[ str ] = [ ]
    for child in doc_contents.children:
        if hasattr( child, 'name' ):
            if (
                child.name == 'div' and
                'admonition' in child.get( 'class', [ ] )
            ): continue
            if child.name == 'p':
                html_content = str( child )
                if html_content and html_content not in descriptions:
                    descriptions.append( html_content )
    return descriptions


def _extract_signature(
    element: __.typx.Any,
    patterns: __.cabc.Mapping[ str, __.typx.Any ]
) -> str:
    ''' Extracts signature/heading content from element. '''
    signature_selectors = __.typx.cast(
        __.cabc.Sequence[ str ], patterns[ 'signature_selectors' ] )
    for selector in signature_selectors:
        signature_elem = element.select_one( selector )
        if signature_elem:
            return _clean_extracted_text( signature_elem.get_text( ) )
    return _clean_extracted_text( element.get_text( ) )


def _extract_using_fallback_selectors(
    element: __.typx.Any,
    patterns: __.cabc.Mapping[ str, __.typx.Any ]
) -> list[ str ]:
    ''' Extracts description using fallback selectors. '''
    descriptions: list[ str ] = [ ]
    description_selectors = __.typx.cast(
        __.cabc.Sequence[ str ], patterns[ 'description_selectors' ] )
    for selector in description_selectors:
        desc_elements = element.select( selector )
        for desc_elem in desc_elements:
            if (
                desc_elem.get( 'class' ) and
                'admonition-title' in desc_elem.get( 'class', [ ] )
            ): continue
            html_content = str( desc_elem )
            if html_content and html_content not in descriptions:
                descriptions.append( html_content )
    return descriptions


def _find_doc_contents_container( element: __.typx.Any ) -> __.typx.Any | None:
    ''' Finds the doc-contents container for the element. '''
    if element.name in ( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ):
        sibling = element.next_sibling
        while sibling:
            if (
                hasattr( sibling, 'get' ) and sibling.name == 'div' and
                'doc-contents' in sibling.get( 'class', [ ] )
            ): return sibling
            sibling = sibling.next_sibling
    return element.select_one( '.doc-contents' )
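
# mkdocstrings-style output typically renders an object heading followed by a
# sibling 'div.doc-contents' block; the sibling scan above targets that
# layout, with a descendant search as the fallback.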


def _find_target_element(
    container: __.typx.Any, element_id: str
) -> __.typx.Any:
    ''' Finds target element within main container using ID strategies. '''
    target = container.find( id = element_id )
    if target: return target
    target = container.find( attrs = { 'data-toc-label': element_id } )
    if target: return target
    for heading in container.find_all(
        [ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ]
    ):
        if element_id in heading.get_text( ):
            return heading
    for section in container.find_all( 'section' ):
        class_attr = section.get( 'class' )
        if class_attr and element_id in ' '.join( class_attr ):
            return section
    return container
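
# Lookup strategies above cascade from most to least precise: exact element
# id, 'data-toc-label' attribute, heading text containing the identifier,
# section class names, and finally the whole container as a last resort.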


def _find_main_content_container(
    soup: __.typx.Any, theme: __.Absential[ str ] = __.absent
) -> __.Absential[ __.typx.Any ]:
    ''' Finds main content container using theme-specific strategies. '''
    theme_name = theme if not __.is_absent( theme ) else 'material'
    patterns = MATERIAL_THEME_PATTERNS.get( theme_name, _GENERIC_PATTERN )
    main_selectors = __.typx.cast(
        __.cabc.Sequence[ str ], patterns[ 'main_content_selectors' ] )
    for selector in main_selectors:
        container = soup.select_one( selector )
        if container: return container
    return __.absent