Coverage for sources/librovore/structures/sphinx/extraction.py: 15%
118 statements
Report generated by coverage.py v7.10.7 at 2025-09-28 22:09 +0000
1# vim: set filetype=python fileencoding=utf-8:
2# -*- coding: utf-8 -*-
4#============================================================================#
5# #
6# Licensed under the Apache License, Version 2.0 (the "License"); #
7# you may not use this file except in compliance with the License. #
8# You may obtain a copy of the License at #
9# #
10# http://www.apache.org/licenses/LICENSE-2.0 #
11# #
12# Unless required by applicable law or agreed to in writing, software #
13# distributed under the License is distributed on an "AS IS" BASIS, #
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
15# See the License for the specific language governing permissions and #
16# limitations under the License. #
17# #
18#============================================================================#
21''' Documentation extraction and content retrieval. '''
24from bs4 import BeautifulSoup as _BeautifulSoup
26from . import __
27from . import urls as _urls
28from .patterns import THEME_PATTERNS as _THEME_PATTERNS
29from .patterns import UNIVERSAL_PATTERNS as _UNIVERSAL_PATTERNS
# Module-level logger for this module, acquired via the package commons.
_scribe = __.acquire_scribe( __name__ )
async def extract_contents(
    auxdata: __.ApplicationGlobals,
    source: str,
    objects: __.cabc.Sequence[ __.InventoryObject ], /, *,
    theme: __.Absential[ str ] = __.absent,
) -> list[ __.ContentDocument ]:
    ''' Extracts documentation content for specified objects.

        Extraction runs concurrently; failures for individual objects are
        dropped rather than propagated, so the result may be shorter than
        the input sequence.
    '''
    base_url = _urls.normalize_base_url( source )
    if not objects: return [ ]
    coroutines = (
        _extract_object_documentation(
            auxdata, base_url, source, inventory_object, theme )
        for inventory_object in objects )
    outcomes = await __.asyncf.gather_async(
        *coroutines, return_exceptions = True )
    documents: list[ __.ContentDocument ] = [ ]
    for outcome in outcomes:
        # Skip failed extractions and successful-but-empty results.
        if not __.generics.is_value( outcome ): continue
        if outcome.value is None: continue
        documents.append( outcome.value )
    return documents
def parse_documentation_html(
    content: str, element_id: str, url: str, *,
    theme: __.Absential[ str ] = __.absent
) -> __.cabc.Mapping[ str, str ]:
    ''' Parses HTML content to extract documentation sections.

        Raises ``DocumentationParseFailure`` on unparseable HTML,
        ``DocumentationContentAbsence`` when no main content container is
        found, and ``DocumentationObjectAbsence`` when the anchor id is
        missing from the container.
    '''
    try: document = _BeautifulSoup( content, 'lxml' )
    except Exception as exc:
        raise __.DocumentationParseFailure( element_id, exc ) from exc
    # Theme should come from detection metadata; when absent, container
    # lookup falls back to generic selectors.
    container = _find_main_content_container( document, theme )
    if __.is_absent( container ):
        raise __.DocumentationContentAbsence( element_id )
    target = container.find( id = element_id )
    if not target:
        raise __.DocumentationObjectAbsence( element_id, url )
    description = _extract_content_with_dsl( target, element_id, theme )
    return { 'description': description, 'object_name': element_id }
def _cleanup_content(
    content: str,
    cleanup_selectors: __.cabc.Sequence[ str ]
) -> str:
    ''' Removes unwanted elements from content using CSS selectors. '''
    if not cleanup_selectors or not content.strip( ): return content
    tree: __.typx.Any = _BeautifulSoup( content, 'lxml' )
    # Decompose per selector in order, so later selectors query the
    # already-pruned tree.
    for selector in cleanup_selectors:
        for unwanted in tree.select( selector ):
            unwanted.decompose( )
    return str( tree )
def _extract_content_with_dsl(
    element: __.typx.Any,
    element_id: str,
    theme: __.Absential[ str ] = __.absent
) -> str:
    ''' Extracts content using universal pattern configuration. '''
    # API signatures ('dt' entries) have a dedicated extraction path.
    if element.name == 'dt' and _is_api_signature( element ):
        return _extract_api_signature_content( element )
    selectors = (
        _UNIVERSAL_PATTERNS[ 'navigation_cleanup' ][ 'universal_selectors' ] )
    return _cleanup_content( _generic_extraction( element ), selectors )
def _extract_description_with_strategy(
    element: __.typx.Any,
    strategy: __.cabc.Mapping[ str, __.typx.Any ]
) -> str:
    ''' Extracts description using DSL strategy. '''
    # 'p' is the conventional default element when the strategy omits one.
    return _get_description_by_source_type(
        element,
        strategy[ 'description_source' ],
        strategy.get( 'description_element', 'p' ) )
async def _extract_object_documentation(
    auxdata: __.ApplicationGlobals,
    base_url: __.typx.Any,
    location: str,
    obj: __.InventoryObject,
    theme: __.Absential[ str ] = __.absent,
) -> __.ContentDocument | None:
    ''' Extracts documentation for a single object.

        Best-effort: returns ``None`` when retrieval or parsing fails,
        rather than raising, so batch extraction can continue.
    '''
    # Local import avoids a circular dependency at module load time.
    from . import conversion as _conversion
    doc_url = _urls.derive_documentation_url( base_url, obj.uri, obj.name )
    try:
        html_content = await __.retrieve_url_as_text(
            auxdata.content_cache, doc_url )
    except Exception as exc:
        _scribe.debug( "Failed to retrieve %s: %s", doc_url, exc )
        return None
    # Prefer the URL fragment as anchor; fall back to the object name.
    anchor = doc_url.fragment or str( obj.name )
    try:
        parsed = parse_documentation_html(
            html_content, anchor, str( doc_url ), theme = theme )
    except Exception: return None
    theme_name = 'unknown' if __.is_absent( theme ) else theme
    return __.ContentDocument(
        inventory_object = obj,
        content_id = __.produce_content_id( location, obj.name ),
        description = _conversion.html_to_markdown(
            parsed[ 'description' ] ),
        documentation_url = doc_url.geturl( ),
        extraction_metadata = __.immut.Dictionary( {
            'theme': theme_name,
            'extraction_method': 'sphinx_html_parsing',
            'relevance_score': 1.0,
            'match_reasons': [ 'direct extraction' ],
        } )
    )
def _find_main_content_container(
    soup: __.typx.Any, theme: __.Absential[ str ] = __.absent
) -> __.Absential[ __.typx.Any ]:
    ''' Finds main content container trying theme-specific patterns first. '''
    theme_containers = _THEME_PATTERNS[ 'content_containers' ]
    selectors: list[ str ] = [ ]
    # Theme-specific selectors take priority over the universal fallbacks.
    if not __.is_absent( theme ) and theme in theme_containers:
        selectors.extend( theme_containers[ theme ] )
    selectors.extend(
        _UNIVERSAL_PATTERNS[ 'content_containers' ][ 'universal_selectors' ] )
    for selector in selectors:
        candidate = soup.select_one( selector )
        if candidate: return candidate
    return __.absent
def _generic_extraction( element: __.typx.Any ) -> str:
    ''' Generic fallback extraction for unknown element types. '''
    parent = element.parent
    if not parent: return ''
    paragraph = parent.find( 'p' )
    return str( paragraph ) if paragraph else ''
197def _get_description_by_source_type(
198 element: __.typx.Any,
199 source_type: str,
200 element_type: str
201) -> str:
202 ''' Gets description content based on source type. '''
203 extractors = {
204 'next_sibling': lambda: _get_sibling_text( element, element_type ),
205 'parent_next_sibling': lambda: _get_parent_sibling_text(
206 element, element_type ),
207 'parent_next_element': lambda: _get_parent_element_text(
208 element, element_type ),
209 'parent_content': lambda: _get_parent_content_text(
210 element, element_type ),
211 'first_paragraph': lambda: _get_first_paragraph_text( element ),
212 'first_main_paragraph': lambda: _get_first_main_paragraph_text(
213 element ),
214 }
215 extractor = extractors.get( source_type )
216 return extractor( ) if extractor else ''
def _get_first_paragraph_text( element: __.typx.Any ) -> str:
    ''' Gets HTML content from first paragraph within element. '''
    paragraph = element.find( 'p' )
    if not paragraph: return ''
    return str( paragraph )
def _get_first_main_paragraph_text( element: __.typx.Any ) -> str:
    ''' Gets HTML content from first paragraph, skipping sidebars. '''
    for candidate in element.find_all( 'p' ):
        # Ignore paragraphs nested inside navigation chrome.
        if candidate.find_parent( [ 'aside', 'nav', 'header' ] ): continue
        # First eligible paragraph wins; bs4 tags with no contents are
        # falsy, so an empty paragraph yields ''.
        return str( candidate ) if candidate else ''
    return ''
def _get_parent_content_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from content element within parent. '''
    parent = element.parent
    if not parent: return ''
    content_element = parent.find( element_type )
    if content_element: return content_element.decode_contents( )
    return ''
def _get_parent_element_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from element within parent. '''
    parent = element.parent
    if not parent: return ''
    found = parent.find( element_type )
    if found: return found.decode_contents( )
    return ''
def _get_parent_sibling_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from parent's next sibling element. '''
    if not element.parent: return ''
    successor = element.parent.find_next_sibling( element_type )
    if successor: return successor.decode_contents( )
    return ''
def _get_sibling_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from next sibling element. '''
    successor = element.find_next_sibling( element_type )
    if successor: return successor.decode_contents( )
    return ''
def _is_api_signature( element: __.typx.Any ) -> bool:
    ''' Determines if element is an API signature using universal patterns. '''
    signature_classes = (
        _UNIVERSAL_PATTERNS[ 'api_signatures' ][ 'signature_classes' ] )
    classes_present = element.get( 'class', [ ] )
    return any( marker in classes_present for marker in signature_classes )
def _extract_api_signature_content( element: __.typx.Any ) -> str:
    ''' Extracts API signature content using universal patterns. '''
    selector = (
        _UNIVERSAL_PATTERNS[ 'api_signatures' ][ 'description_selector' ] )
    companion = element.find_next_sibling( selector )
    if companion: return companion.decode_contents( )
    return ''