Coverage for sources/librovore/structures/sphinx/extraction.py: 11%
139 statements
« prev ^ index » next coverage.py v7.10.4, created at 2025-08-17 23:43 +0000
1# vim: set filetype=python fileencoding=utf-8:
2# -*- coding: utf-8 -*-
4#============================================================================#
5# #
6# Licensed under the Apache License, Version 2.0 (the "License"); #
7# you may not use this file except in compliance with the License. #
8# You may obtain a copy of the License at #
9# #
10# http://www.apache.org/licenses/LICENSE-2.0 #
11# #
12# Unless required by applicable law or agreed to in writing, software #
13# distributed under the License is distributed on an "AS IS" BASIS, #
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
15# See the License for the specific language governing permissions and #
16# limitations under the License. #
17# #
18#============================================================================#
21''' Documentation extraction and content retrieval. '''
24from bs4 import BeautifulSoup as _BeautifulSoup
26from . import __
27from . import urls as _urls
# Module-level logger, acquired through the package's logging facade.
_scribe = __.acquire_scribe( __name__ )
# Theme-specific content extraction patterns.
# Maps a Sphinx theme name to an extraction DSL:
#   anchor_elements:    tag names that may carry the object's anchor id
#   content_strategies: per-tag recipe naming where the signature
#                       ('signature_source') and description
#                       ('description_source' / 'description_element')
#                       live relative to the anchor element
#   cleanup_selectors:  CSS selectors for elements to strip from output
THEME_EXTRACTION_PATTERNS: __.cabc.Mapping[
    str, __.cabc.Mapping[ str, __.typx.Any ]
] = __.immut.Dictionary( {
    'pydoctheme': __.immut.Dictionary( {
        'anchor_elements': [ 'dt', 'a' ],
        'content_strategies': __.immut.Dictionary( {
            'dt': __.immut.Dictionary( {
                'signature_source': 'self',
                'description_source': 'next_sibling',
                'description_element': 'dd',
            } ),
            'a': __.immut.Dictionary( {
                'signature_source': 'parent_text',
                'description_source': 'parent_next_sibling',
                'description_element': 'dd',
            } ),
        } ),
        'cleanup_selectors': [ 'a.headerlink' ],
    } ),
    'furo': __.immut.Dictionary( {
        'anchor_elements': [ 'span', 'a', 'dt' ],
        'content_strategies': __.immut.Dictionary( {
            'span': __.immut.Dictionary( {
                'signature_source': 'parent_header',
                'description_source': 'parent_next_element',
                'description_element': 'p',
                'fallback_container': 'section',
            } ),
            'a': __.immut.Dictionary( {
                'signature_source': 'parent_text',
                'description_source': 'parent_next_element',
                'description_element': 'p',
            } ),
            'dt': __.immut.Dictionary( {
                'signature_source': 'self',
                'description_source': 'next_sibling',
                'description_element': 'dd',
            } ),
        } ),
        'cleanup_selectors': [ 'a.headerlink', '.highlight' ],
    } ),
    'sphinx_rtd_theme': __.immut.Dictionary( {
        'anchor_elements': [ 'dt', 'span', 'a' ],
        'content_strategies': __.immut.Dictionary( {
            'dt': __.immut.Dictionary( {
                'signature_source': 'self',
                'description_source': 'next_sibling',
                'description_element': 'dd',
            } ),
            'span': __.immut.Dictionary( {
                'signature_source': 'parent_header',
                'description_source': 'parent_content',
                'description_element': 'p',
            } ),
        } ),
        'cleanup_selectors': [ 'a.headerlink' ],
    } ),
} )
# Generic fallback pattern for unknown themes.
# Same DSL shape as THEME_EXTRACTION_PATTERNS entries; casts a wide net
# over common anchor tags so extraction still works without theme metadata.
_GENERIC_PATTERN = __.immut.Dictionary( {
    'anchor_elements': [ 'dt', 'span', 'a', 'section', 'div' ],
    'content_strategies': __.immut.Dictionary( {
        'dt': __.immut.Dictionary( {
            'signature_source': 'self',
            'description_source': 'next_sibling',
            'description_element': 'dd',
        } ),
        'section': __.immut.Dictionary( {
            'signature_source': 'first_header',
            'description_source': 'first_paragraph',
            'description_element': 'p',
        } ),
        'span': __.immut.Dictionary( {
            'signature_source': 'parent_header',
            'description_source': 'parent_next_element',
            'description_element': 'p',
        } ),
        'a': __.immut.Dictionary( {
            'signature_source': 'parent_text',
            'description_source': 'parent_next_element',
            'description_element': 'p',
        } ),
    } ),
    'cleanup_selectors': [ 'a.headerlink' ],
} )
async def extract_contents(
    auxdata: __.ApplicationGlobals,
    source: str,
    objects: __.cabc.Sequence[ __.cabc.Mapping[ str, __.typx.Any ] ], /, *,
    theme: __.Absential[ str ] = __.absent,
    include_snippets: bool = True,
) -> list[ dict[ str, __.typx.Any ] ]:
    ''' Extracts documentation content for specified objects.

        One retrieval task is spawned per object and awaited
        concurrently; objects whose extraction fails or yields nothing
        are silently omitted from the results.
    '''
    base_url = _urls.normalize_base_url( source )
    if not objects: return [ ]
    extraction_tasks = [ ]
    for obj in objects:
        extraction_tasks.append(
            _extract_object_documentation(
                auxdata, base_url, dict( obj ), include_snippets, theme ) )
    outcomes = await __.asyncf.gather_async(
        *extraction_tasks, return_exceptions = True )
    extracted: list[ dict[ str, __.typx.Any ] ] = [ ]
    for outcome in outcomes:
        if not __.generics.is_value( outcome ): continue
        if outcome.value is None: continue
        extracted.append( dict( outcome.value ) )
    return extracted
def parse_documentation_html(
    content: str, element_id: str, url: str, *,
    theme: __.Absential[ str ] = __.absent
) -> __.cabc.Mapping[ str, str ]:
    ''' Parses HTML content to extract documentation sections.

        Raises ``DocumentationParseFailure`` when the HTML cannot be
        parsed, ``DocumentationContentAbsence`` when no main content
        container exists, and ``DocumentationObjectAbsence`` when the
        requested element id is not present.
    '''
    try: document = _BeautifulSoup( content, 'lxml' )
    except Exception as exc:
        raise __.DocumentationParseFailure( element_id, exc ) from exc
    # Theme arrives from detection metadata; when absent, container
    # lookup falls back to generic heuristics.
    container = _find_main_content_container( document, theme )
    if __.is_absent( container ):
        raise __.DocumentationContentAbsence( element_id )
    target = container.find( id = element_id )
    if not target: raise __.DocumentationObjectAbsence( element_id, url )
    signature, description = _extract_content_with_dsl(
        target, element_id, theme )
    return {
        'signature': signature,
        'description': description,
        'object_name': element_id,
    }
def _cleanup_content(
    content: str,
    cleanup_selectors: __.cabc.Sequence[ str ]
) -> str:
    ''' Removes unwanted elements from content using CSS selectors.

        Parses the HTML fragment, deletes every element matching any of
        the supplied CSS selectors, and returns the surviving markup.
        Content is returned unchanged when it is empty, when no
        selectors are supplied, or when cleanup fails for any reason
        (best-effort: cleanup must never break extraction).
    '''
    if not content or not cleanup_selectors: return content
    try:
        fragment = _BeautifulSoup( content, 'lxml' )
        for selector in cleanup_selectors:
            for unwanted in fragment.select( selector ):
                unwanted.decompose( )
        # lxml wraps fragments in html/body; return only the inner markup.
        body = fragment.body
        if body is not None: return body.decode_contents( )
        return str( fragment )
    except Exception as exc:
        _scribe.debug( "Content cleanup failed: %s", exc )
        return content
def _extract_content_with_dsl(
    element: __.typx.Any,
    element_id: str,
    theme: __.Absential[ str ] = __.absent
) -> tuple[ str, str ]:
    ''' Extracts content using DSL pattern configuration.

        Selects a per-theme pattern (generic fallback when the theme is
        absent or unknown), applies the tag-specific strategy for the
        anchor element, and post-processes the description with the
        pattern's cleanup selectors when present.
    '''
    if __.is_absent( theme ): pattern = _GENERIC_PATTERN
    else:
        pattern = THEME_EXTRACTION_PATTERNS.get( theme, _GENERIC_PATTERN )
    strategies = __.typx.cast(
        __.cabc.Mapping[ str, __.cabc.Mapping[ str, __.typx.Any ] ],
        pattern[ 'content_strategies' ] )
    strategy = strategies.get( element.name )
    if not strategy: return _generic_extraction( element )
    signature = _extract_signature_with_strategy( element, strategy )
    description = _extract_description_with_strategy( element, strategy )
    if 'cleanup_selectors' in pattern:
        selectors = __.typx.cast(
            __.cabc.Sequence[ str ], pattern[ 'cleanup_selectors' ] )
        description = _cleanup_content( description, selectors )
    return signature, description
def _extract_description_with_strategy(
    element: __.typx.Any,
    strategy: __.cabc.Mapping[ str, __.typx.Any ]
) -> str:
    ''' Extracts description using DSL strategy.

        Reads the strategy's 'description_source' and optional
        'description_element' (default 'p') and delegates to the
        source-type dispatcher.
    '''
    source = __.typx.cast( str, strategy[ 'description_source' ] )
    container_tag = __.typx.cast(
        str, strategy.get( 'description_element', 'p' ) )
    return _get_description_by_source_type( element, source, container_tag )
async def _extract_object_documentation(
    auxdata: __.ApplicationGlobals,
    base_url: __.typx.Any,
    obj: dict[ str, __.typx.Any ],
    include_snippets: bool,
    theme: __.Absential[ str ] = __.absent
) -> dict[ str, __.typx.Any ] | None:
    ''' Extracts documentation for a single object.

        Returns ``None`` on retrieval or parse failure so batch
        extraction can skip unreachable objects without aborting.
        Expects ``obj`` to provide 'uri', 'name', 'role', 'domain',
        and 'priority' keys.
    '''
    # Local import — NOTE(review): presumably avoids a circular import
    # at module load time; confirm against the package layout.
    from . import conversion as _conversion
    doc_url = _urls.derive_documentation_url(
        base_url, obj[ 'uri' ], obj[ 'name' ] )
    try:
        html_content = (
            await __.retrieve_url_as_text(
                auxdata.content_cache, doc_url ) )
    except Exception as exc:
        # Best-effort: log at debug level and skip unfetchable pages.
        _scribe.debug( "Failed to retrieve %s: %s", doc_url, exc )
        return None
    # Prefer the URL fragment as the anchor id; fall back to the name.
    anchor = doc_url.fragment or str( obj[ 'name' ] )
    try:
        parsed_content = parse_documentation_html(
            html_content, anchor, str( doc_url ), theme = theme )
    except Exception: return None
    description = _conversion.html_to_markdown(
        parsed_content[ 'description' ] )
    snippet_max_length = 200
    if include_snippets:
        # Truncate long descriptions with an ellipsis marker.
        content_snippet = (
            description[ : snippet_max_length ] + '...'
            if len( description ) > snippet_max_length
            else description )
    else: content_snippet = ''
    return {
        'object_name': obj[ 'name' ],
        'object_type': obj[ 'role' ],
        'domain': obj[ 'domain' ],
        'priority': obj[ 'priority' ],
        'url': doc_url.geturl( ),
        'signature': parsed_content[ 'signature' ],
        'description': description,
        'content_snippet': content_snippet,
        'relevance_score': 1.0,
        'match_reasons': [ 'direct extraction' ],
    }
def _extract_signature_with_strategy(
    element: __.typx.Any,
    strategy: __.cabc.Mapping[ str, __.typx.Any ]
) -> str:
    ''' Extracts signature using DSL strategy.

        Supported sources: 'self', 'parent_text', 'parent_header',
        'first_header'; anything else falls back to the element's own
        text. Missing parents or headers yield an empty string.
    '''
    headers = [ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ]
    source = __.typx.cast( str, strategy[ 'signature_source' ] )
    if source == 'parent_text':
        parent = element.parent
        if not parent: return ''
        return _clean_extracted_text( parent.get_text( ) )
    if source == 'parent_header':
        parent = element.parent
        if not parent: return ''
        header = parent.find( headers )
        if not header: return ''
        return _clean_extracted_text( header.get_text( ) )
    if source == 'first_header':
        header = element.find( headers )
        if not header: return ''
        return _clean_extracted_text( header.get_text( ) )
    # 'self' and unrecognized source types use the element's own text.
    return _clean_extracted_text( element.get_text( ) )
def _find_main_content_container(
    soup: __.typx.Any, theme: __.Absential[ str ] = __.absent
) -> __.Absential[ __.typx.Any ]:
    ''' Finds the main content container using theme-specific strategies.

        Probes a theme-ordered list of (tag, attributes) candidates and
        returns the first match; ``None`` in a probe list means the
        document body itself. Returns ``absent`` when nothing matches.
    '''
    theme_probes = {
        'furo': [
            ( 'article', { 'role': 'main' } ),
            ( 'div', { 'id': 'furo-main-content' } ),
        ],
        'sphinx_rtd_theme': [
            ( 'div', { 'class': 'document' } ),
            ( 'div', { 'class': 'body' } ),
            ( 'div', { 'role': 'main' } ),
        ],
        'pydoctheme': [  # Python docs
            ( 'div', { 'class': 'body' } ),
            ( 'div', { 'class': 'content' } ),
            None,  # Python docs often use body directly
        ],
        'flask': [  # Flask docs
            ( 'div', { 'class': 'body' } ),
            ( 'div', { 'class': 'content' } ),
            None,
        ],
        'alabaster': [
            ( 'div', { 'class': 'body' } ),
            ( 'div', { 'class': 'content' } ),
        ],
    }
    generic_probes = [
        ( 'article', { 'role': 'main' } ),  # Furo theme
        ( 'div', { 'class': 'body' } ),     # Basic theme
        ( 'div', { 'class': 'content' } ),  # Nature theme
        ( 'div', { 'class': 'main' } ),     # Generic main
        ( 'main', None ),                   # HTML5 main element
        ( 'div', { 'role': 'main' } ),      # Role-based
        None,                               # Fallback to body
    ]
    if __.is_absent( theme ): probes = generic_probes
    else: probes = theme_probes.get( theme, generic_probes )
    for probe in probes:
        if probe is None: candidate = soup.body
        elif probe[ 1 ] is None: candidate = soup.find( probe[ 0 ] )
        else: candidate = soup.find( probe[ 0 ], probe[ 1 ] )
        if candidate: return candidate
    return __.absent
def _clean_extracted_text( text: str ) -> str:
    ''' Cleans extracted text while preserving internal spacing.

        Trims leading/trailing whitespace, collapses runs of spaces to
        one, and squeezes blank-line runs to a single paragraph break.
    '''
    trimmed = text.strip( )
    single_spaced = __.re.sub( r' +', ' ', trimmed )
    return __.re.sub( r'\n\s*\n', '\n\n', single_spaced )
def _generic_extraction( element: __.typx.Any ) -> tuple[ str, str ]:
    ''' Generic fallback extraction for unknown element types.

        Signature is the element's own cleaned text; description is the
        first paragraph found under the element's parent, if any.
    '''
    signature = _clean_extracted_text( element.get_text( ) )
    parent = element.parent
    paragraph = parent.find( 'p' ) if parent else None
    description = str( paragraph ) if paragraph else ''
    return signature, description
def _get_description_by_source_type(
    element: __.typx.Any,
    source_type: str,
    element_type: str
) -> str:
    ''' Gets description content based on source type.

        Dispatches to the helper matching the DSL source type; unknown
        source types yield an empty string.
    '''
    if source_type == 'next_sibling':
        return _get_sibling_text( element, element_type )
    if source_type == 'parent_next_sibling':
        return _get_parent_sibling_text( element, element_type )
    if source_type == 'parent_next_element':
        return _get_parent_element_text( element, element_type )
    if source_type == 'parent_content':
        return _get_parent_content_text( element, element_type )
    if source_type == 'first_paragraph':
        return _get_first_paragraph_text( element )
    return ''
def _get_first_paragraph_text( element: __.typx.Any ) -> str:
    ''' Gets HTML content from first paragraph within element. '''
    paragraph = element.find( 'p' )
    if not paragraph: return ''
    return str( paragraph )
def _get_parent_content_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from content element within parent. '''
    parent = element.parent
    if not parent: return ''
    content_elem = parent.find( element_type )
    if not content_elem: return ''
    return content_elem.decode_contents( )
def _get_parent_element_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from element within parent. '''
    parent = element.parent
    if not parent: return ''
    matched = parent.find( element_type )
    if not matched: return ''
    return matched.decode_contents( )
def _get_parent_sibling_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from parent's next sibling element. '''
    parent = element.parent
    if not parent: return ''
    sibling = parent.find_next_sibling( element_type )
    if not sibling: return ''
    return sibling.decode_contents( )
def _get_sibling_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from next sibling element. '''
    sibling = element.find_next_sibling( element_type )
    if not sibling: return ''
    return sibling.decode_contents( )