Coverage for sources/librovore/structures/sphinx/extraction.py: 11%
139 statements
« prev ^ index » next coverage.py v7.10.5, created at 2025-08-29 01:14 +0000
1# vim: set filetype=python fileencoding=utf-8:
2# -*- coding: utf-8 -*-
4#============================================================================#
5# #
6# Licensed under the Apache License, Version 2.0 (the "License"); #
7# you may not use this file except in compliance with the License. #
8# You may obtain a copy of the License at #
9# #
10# http://www.apache.org/licenses/LICENSE-2.0 #
11# #
12# Unless required by applicable law or agreed to in writing, software #
13# distributed under the License is distributed on an "AS IS" BASIS, #
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
15# See the License for the specific language governing permissions and #
16# limitations under the License. #
17# #
18#============================================================================#
21''' Documentation extraction and content retrieval. '''
24from bs4 import BeautifulSoup as _BeautifulSoup
26from . import __
27from . import urls as _urls
30_scribe = __.acquire_scribe( __name__ )
# Theme-specific content extraction patterns.
#
# DSL vocabulary consumed by _extract_content_with_dsl and its helpers:
#   anchor_elements:      tag names which may carry a documentation anchor id
#   content_strategies:   per-tag extraction recipe, keyed by tag name:
#     signature_source:     where signature text lives relative to the anchor
#                           (dispatched in _extract_signature_with_strategy)
#     description_source:   where description HTML lives relative to the anchor
#                           (dispatched in _get_description_by_source_type)
#     description_element:  tag name searched for the description content
#   cleanup_selectors:    CSS selectors for elements to strip from extracted
#                         descriptions (consumed by _cleanup_content)
THEME_EXTRACTION_PATTERNS: __.cabc.Mapping[
    str, __.cabc.Mapping[ str, __.typx.Any ]
] = __.immut.Dictionary( {
    # python.org documentation theme
    'pydoctheme': __.immut.Dictionary( {
        'anchor_elements': [ 'dt', 'a' ],
        'content_strategies': __.immut.Dictionary( {
            'dt': __.immut.Dictionary( {
                'signature_source': 'self',
                'description_source': 'next_sibling',
                'description_element': 'dd',
            } ),
            'a': __.immut.Dictionary( {
                'signature_source': 'parent_text',
                'description_source': 'parent_next_sibling',
                'description_element': 'dd',
            } ),
        } ),
        'cleanup_selectors': [ 'a.headerlink' ],
    } ),
    'furo': __.immut.Dictionary( {
        'anchor_elements': [ 'span', 'a', 'dt' ],
        'content_strategies': __.immut.Dictionary( {
            'span': __.immut.Dictionary( {
                'signature_source': 'parent_header',
                'description_source': 'parent_next_element',
                'description_element': 'p',
                # NOTE(review): 'fallback_container' is not consumed by the
                # visible extraction code -- confirm whether still needed.
                'fallback_container': 'section',
            } ),
            'a': __.immut.Dictionary( {
                'signature_source': 'parent_text',
                'description_source': 'parent_next_element',
                'description_element': 'p',
            } ),
            'dt': __.immut.Dictionary( {
                'signature_source': 'self',
                'description_source': 'next_sibling',
                'description_element': 'dd',
            } ),
        } ),
        'cleanup_selectors': [ 'a.headerlink', '.highlight' ],
    } ),
    'sphinx_rtd_theme': __.immut.Dictionary( {
        'anchor_elements': [ 'dt', 'span', 'a' ],
        'content_strategies': __.immut.Dictionary( {
            'dt': __.immut.Dictionary( {
                'signature_source': 'self',
                'description_source': 'next_sibling',
                'description_element': 'dd',
            } ),
            'span': __.immut.Dictionary( {
                'signature_source': 'parent_header',
                'description_source': 'parent_content',
                'description_element': 'p',
            } ),
        } ),
        'cleanup_selectors': [ 'a.headerlink' ],
    } ),
} )
# Generic fallback pattern for unknown themes. Probes the anchor tags most
# Sphinx themes emit; strategy keys follow the same DSL vocabulary as
# THEME_EXTRACTION_PATTERNS above.
_GENERIC_PATTERN = __.immut.Dictionary( {
    'anchor_elements': [ 'dt', 'span', 'a', 'section', 'div' ],
    'content_strategies': __.immut.Dictionary( {
        'dt': __.immut.Dictionary( {
            'signature_source': 'self',
            'description_source': 'next_sibling',
            'description_element': 'dd',
        } ),
        'section': __.immut.Dictionary( {
            'signature_source': 'first_header',
            'description_source': 'first_paragraph',
            'description_element': 'p',
        } ),
        'span': __.immut.Dictionary( {
            'signature_source': 'parent_header',
            'description_source': 'parent_next_element',
            'description_element': 'p',
        } ),
        'a': __.immut.Dictionary( {
            'signature_source': 'parent_text',
            'description_source': 'parent_next_element',
            'description_element': 'p',
        } ),
    } ),
    'cleanup_selectors': [ 'a.headerlink' ],
} )
async def extract_contents(
    auxdata: __.ApplicationGlobals,
    source: str,
    objects: __.cabc.Sequence[ __.InventoryObject ], /, *,
    theme: __.Absential[ str ] = __.absent,
    include_snippets: bool = True,
) -> list[ __.ContentDocument ]:
    ''' Extracts documentation content for specified objects.

        Retrieves each object's page concurrently; failed or empty
        extractions are silently dropped from the result list.
    '''
    location = _urls.normalize_base_url( source )
    if not objects: return [ ]
    extractions = tuple(
        _extract_object_documentation(
            auxdata, location, entry, include_snippets, theme )
        for entry in objects )
    outcomes = await __.asyncf.gather_async(
        *extractions, return_exceptions = True )
    documents: list[ __.ContentDocument ] = [ ]
    for outcome in outcomes:
        # Keep only successful extractions which produced a document.
        if not __.generics.is_value( outcome ): continue
        if outcome.value is None: continue
        documents.append( outcome.value )
    return documents
def parse_documentation_html(
    content: str, element_id: str, url: str, *,
    theme: __.Absential[ str ] = __.absent
) -> __.cabc.Mapping[ str, str ]:
    ''' Parses HTML content to extract documentation sections.

        Raises DocumentationParseFailure on unparseable HTML,
        DocumentationContentAbsence when no main container is found, and
        DocumentationObjectAbsence when the anchor id is not present.
    '''
    try: document = _BeautifulSoup( content, 'lxml' )
    except Exception as exc:
        raise __.DocumentationParseFailure( element_id, exc ) from exc
    # Theme comes from detection metadata; when absent, container lookup
    # falls back to generic probing.
    container = _find_main_content_container( document, theme )
    if __.is_absent( container ):
        raise __.DocumentationContentAbsence( element_id )
    target = container.find( id = element_id )
    # Truthiness check: an empty anchor tag is treated as missing.
    if not target:
        raise __.DocumentationObjectAbsence( element_id, url )
    signature, description = _extract_content_with_dsl(
        target, element_id, theme )
    return dict(
        signature = signature,
        description = description,
        object_name = element_id )
def _cleanup_content(
    content: str,
    cleanup_selectors: __.cabc.Sequence[ str ]
) -> str:
    ''' Removes unwanted elements from content using CSS selectors.

        Parses the HTML fragment, decomposes every element matched by any
        of the supplied CSS selectors, and returns the remaining markup.
        Returns the input unchanged when there is nothing to remove or the
        fragment cannot be parsed (best-effort cleanup, never raises).
    '''
    # Previously a TODO stub which returned content unmodified, leaving the
    # 'cleanup_selectors' entries of the theme DSL without effect.
    if not content or not cleanup_selectors: return content
    try: fragment = _BeautifulSoup( content, 'lxml' )
    except Exception: return content
    removed = False
    for selector in cleanup_selectors:
        for node in fragment.select( selector ):
            node.decompose( )
            removed = True
    # Preserve the original markup byte-for-byte when nothing matched.
    if not removed: return content
    # lxml wraps fragments in <html><body>; unwrap before returning.
    body = fragment.body
    return body.decode_contents( ) if body is not None else str( fragment )
def _extract_content_with_dsl(
    element: __.typx.Any,
    element_id: str,
    theme: __.Absential[ str ] = __.absent
) -> tuple[ str, str ]:
    ''' Extracts content using DSL pattern configuration.

        Returns ( signature, description ); falls back to generic
        extraction when the element tag has no configured strategy.
    '''
    if __.is_absent( theme ) or theme is None: pattern = _GENERIC_PATTERN
    else:
        pattern = THEME_EXTRACTION_PATTERNS.get( theme, _GENERIC_PATTERN )
    strategies = __.typx.cast(
        __.cabc.Mapping[ str, __.cabc.Mapping[ str, __.typx.Any ] ],
        pattern[ 'content_strategies' ] )
    strategy = strategies.get( element.name )
    if not strategy: return _generic_extraction( element )
    signature = _extract_signature_with_strategy( element, strategy )
    description = _extract_description_with_strategy( element, strategy )
    if 'cleanup_selectors' in pattern:
        selectors = __.typx.cast(
            __.cabc.Sequence[ str ], pattern[ 'cleanup_selectors' ] )
        description = _cleanup_content( description, selectors )
    return signature, description
def _extract_description_with_strategy(
    element: __.typx.Any,
    strategy: __.cabc.Mapping[ str, __.typx.Any ]
) -> str:
    ''' Extracts description using DSL strategy. '''
    source = __.typx.cast( str, strategy[ 'description_source' ] )
    # Default to paragraph tags when the strategy names no element.
    tag = __.typx.cast( str, strategy.get( 'description_element', 'p' ) )
    return _get_description_by_source_type( element, source, tag )
async def _extract_object_documentation(
    auxdata: __.ApplicationGlobals,
    base_url: __.typx.Any,
    obj: __.InventoryObject,
    include_snippets: bool,
    theme: __.Absential[ str ] = __.absent
) -> __.ContentDocument | None:
    ''' Extracts documentation for a single object.

        Returns None on retrieval or parse failure (best-effort; callers
        drop missing documents rather than aborting the batch).
    '''
    from . import conversion as _conversion
    doc_url = _urls.derive_documentation_url( base_url, obj.uri, obj.name )
    try:
        html_content = await __.retrieve_url_as_text(
            auxdata.content_cache, doc_url )
    except Exception as exc:
        _scribe.debug( "Failed to retrieve %s: %s", doc_url, exc )
        return None
    # Prefer the URL fragment as the anchor; fall back to the object name.
    anchor = doc_url.fragment or str( obj.name )
    try:
        parsed_content = parse_documentation_html(
            html_content, anchor, str( doc_url ), theme = theme )
    except Exception: return None
    description = _conversion.html_to_markdown(
        parsed_content[ 'description' ] )
    content_snippet = ''
    if include_snippets:
        snippet_max_length = 200
        content_snippet = description
        if len( description ) > snippet_max_length:
            content_snippet = description[ : snippet_max_length ] + '...'
    theme_label = 'unknown' if __.is_absent( theme ) else theme
    return __.ContentDocument(
        inventory_object = obj,
        signature = parsed_content[ 'signature' ],
        description = description,
        content_snippet = content_snippet,
        documentation_url = doc_url.geturl( ),
        extraction_metadata = __.immut.Dictionary( {
            'theme': theme_label,
            'extraction_method': 'sphinx_html_parsing',
            'relevance_score': 1.0,
            'match_reasons': [ 'direct extraction' ],
        } )
    )
def _extract_signature_with_strategy(
    element: __.typx.Any,
    strategy: __.cabc.Mapping[ str, __.typx.Any ]
) -> str:
    ''' Extracts signature using DSL strategy. '''
    source = __.typx.cast( str, strategy[ 'signature_source' ] )
    headers = [ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ]
    if source == 'parent_text':
        parent = element.parent
        if not parent: return ''
        return _clean_extracted_text( parent.get_text( ) )
    if source == 'parent_header':
        parent = element.parent
        if not parent: return ''
        header = parent.find( headers )
        if not header: return ''
        return _clean_extracted_text( header.get_text( ) )
    if source == 'first_header':
        header = element.find( headers )
        if not header: return ''
        return _clean_extracted_text( header.get_text( ) )
    # 'self' and any unrecognized source use the element's own text.
    return _clean_extracted_text( element.get_text( ) )
def _find_main_content_container(
    soup: __.typx.Any, theme: __.Absential[ str ] = __.absent
) -> __.Absential[ __.typx.Any ]:
    ''' Finds the main content container using theme-specific strategies.

        Probes candidate containers in order of specificity for the given
        theme; returns the first truthy match or ``absent``.
    '''
    # Each probe is ( tag, attrs ); None means the raw <body> element.
    probes_by_theme = {
        'furo': (
            ( 'article', { 'role': 'main' } ),
            ( 'div', { 'id': 'furo-main-content' } ),
        ),
        'sphinx_rtd_theme': (
            ( 'div', { 'class': 'document' } ),
            ( 'div', { 'class': 'body' } ),
            ( 'div', { 'role': 'main' } ),
        ),
        'pydoctheme': (  # Python docs
            ( 'div', { 'class': 'body' } ),
            ( 'div', { 'class': 'content' } ),
            None,  # Python docs often use body directly
        ),
        'flask': (  # Flask docs
            ( 'div', { 'class': 'body' } ),
            ( 'div', { 'class': 'content' } ),
            None,
        ),
        'alabaster': (
            ( 'div', { 'class': 'body' } ),
            ( 'div', { 'class': 'content' } ),
        ),
    }
    generic_probes = (  # fallback for unknown themes
        ( 'article', { 'role': 'main' } ),   # Furo theme
        ( 'div', { 'class': 'body' } ),      # Basic theme
        ( 'div', { 'class': 'content' } ),   # Nature theme
        ( 'div', { 'class': 'main' } ),      # Generic main
        ( 'main', { } ),                     # HTML5 main element
        ( 'div', { 'role': 'main' } ),       # Role-based
        None,                                # Fallback to body
    )
    if __.is_absent( theme ): probes = generic_probes
    else: probes = probes_by_theme.get( theme, generic_probes )
    for probe in probes:
        if probe is None: candidate = soup.body
        else:
            name, attrs = probe
            candidate = soup.find( name, attrs )
        if candidate: return candidate
    return __.absent
def _clean_extracted_text( text: str ) -> str:
    ''' Cleans extracted text while preserving internal spacing. '''
    # Trim the ends, collapse runs of spaces, then squeeze blank-line runs
    # down to single paragraph breaks.
    trimmed = text.strip( )
    single_spaced = __.re.sub( r' +', ' ', trimmed )
    return __.re.sub( r'\n\s*\n', '\n\n', single_spaced )
def _generic_extraction( element: __.typx.Any ) -> tuple[ str, str ]:
    ''' Generic fallback extraction for unknown element types.

        Signature is the element's own text; description is the first
        paragraph found under the element's parent, when present.
    '''
    signature = _clean_extracted_text( element.get_text( ) )
    parent = element.parent
    if not parent: return signature, ''
    paragraph = parent.find( 'p' )
    return signature, ( str( paragraph ) if paragraph else '' )
def _get_description_by_source_type(
    element: __.typx.Any,
    source_type: str,
    element_type: str
) -> str:
    ''' Gets description content based on source type. '''
    if source_type == 'next_sibling':
        return _get_sibling_text( element, element_type )
    if source_type == 'parent_next_sibling':
        return _get_parent_sibling_text( element, element_type )
    if source_type == 'parent_next_element':
        return _get_parent_element_text( element, element_type )
    if source_type == 'parent_content':
        return _get_parent_content_text( element, element_type )
    if source_type == 'first_paragraph':
        return _get_first_paragraph_text( element )
    return ''  # unknown source types yield no description
def _get_first_paragraph_text( element: __.typx.Any ) -> str:
    ''' Gets HTML content from first paragraph within element. '''
    first = element.find( 'p' )
    if not first: return ''
    return str( first )
def _get_parent_content_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from content element within parent. '''
    parent = element.parent
    if not parent: return ''
    found = parent.find( element_type )
    if not found: return ''
    return found.decode_contents( )
def _get_parent_element_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from element within parent. '''
    parent = element.parent
    if not parent: return ''
    found = parent.find( element_type )
    if not found: return ''
    return found.decode_contents( )
def _get_parent_sibling_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from parent's next sibling element. '''
    parent = element.parent
    if not parent: return ''
    sibling = parent.find_next_sibling( element_type )
    if not sibling: return ''
    return sibling.decode_contents( )
def _get_sibling_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from next sibling element. '''
    sibling = element.find_next_sibling( element_type )
    if not sibling: return ''
    return sibling.decode_contents( )