Coverage for sources/librovore/structures/sphinx/extraction.py: 12% of 115 statements
# vim: set filetype=python fileencoding=utf-8:
# -*- coding: utf-8 -*-

#============================================================================#
#                                                                            #
# Licensed under the Apache License, Version 2.0 (the "License");            #
# you may not use this file except in compliance with the License.           #
# You may obtain a copy of the License at                                    #
#                                                                            #
#     http://www.apache.org/licenses/LICENSE-2.0                             #
#                                                                            #
# Unless required by applicable law or agreed to in writing, software        #
# distributed under the License is distributed on an "AS IS" BASIS,          #
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.   #
# See the License for the specific language governing permissions and        #
# limitations under the License.                                             #
#                                                                            #
#============================================================================#


''' Documentation extraction and content retrieval. '''


from bs4 import BeautifulSoup as _BeautifulSoup

from . import __
from . import urls as _urls


_scribe = __.acquire_scribe( __name__ )


# Theme-specific content extraction patterns
THEME_EXTRACTION_PATTERNS: __.cabc.Mapping[
    str, __.cabc.Mapping[ str, __.typx.Any ]
] = __.immut.Dictionary( {
    'pydoctheme': __.immut.Dictionary( {
        'anchor_elements': [ 'dt', 'a' ],
        'content_strategies': __.immut.Dictionary( {
            'dt': __.immut.Dictionary( {
                'description_source': 'next_sibling',
                'description_element': 'dd',
            } ),
            'a': __.immut.Dictionary( {
                'description_source': 'parent_next_sibling',
                'description_element': 'dd',
            } ),
        } ),
        'cleanup_selectors': [ 'a.headerlink' ],
    } ),
    'furo': __.immut.Dictionary( {
        'anchor_elements': [ 'span', 'a', 'dt' ],
        'content_strategies': __.immut.Dictionary( {
            'span': __.immut.Dictionary( {
                'description_source': 'parent_next_element',
                'description_element': 'p',
                'fallback_container': 'section',
            } ),
            'a': __.immut.Dictionary( {
                'description_source': 'parent_next_element',
                'description_element': 'p',
            } ),
            'dt': __.immut.Dictionary( {
                'description_source': 'next_sibling',
                'description_element': 'dd',
            } ),
        } ),
        'cleanup_selectors': [ 'a.headerlink', '.highlight' ],
    } ),
    'sphinx_rtd_theme': __.immut.Dictionary( {
        'anchor_elements': [ 'dt', 'span', 'a' ],
        'content_strategies': __.immut.Dictionary( {
            'dt': __.immut.Dictionary( {
                'description_source': 'next_sibling',
                'description_element': 'dd',
            } ),
            'span': __.immut.Dictionary( {
                'description_source': 'parent_content',
                'description_element': 'p',
            } ),
        } ),
        'cleanup_selectors': [ 'a.headerlink' ],
    } ),
} )

# Generic fallback pattern for unknown themes
_GENERIC_PATTERN = __.immut.Dictionary( {
    'anchor_elements': [ 'dt', 'span', 'a', 'section', 'div' ],
    'content_strategies': __.immut.Dictionary( {
        'dt': __.immut.Dictionary( {
            'description_source': 'next_sibling',
            'description_element': 'dd',
        } ),
        'section': __.immut.Dictionary( {
            'description_source': 'first_paragraph',
            'description_element': 'p',
        } ),
        'span': __.immut.Dictionary( {
            'description_source': 'parent_next_element',
            'description_element': 'p',
        } ),
        'a': __.immut.Dictionary( {
            'description_source': 'parent_next_element',
            'description_element': 'p',
        } ),
    } ),
    'cleanup_selectors': [ 'a.headerlink' ],
} )
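
# Illustrative example (not executed): under the 'furo' pattern above, an
# anchor that resolves to a <dt> element uses the 'next_sibling' strategy,
# so for markup like
#
#     <dt id="pkg.func">pkg.func(arg)</dt>
#     <dd><p>Frobnicates the arg.</p></dd>
#
# the description becomes the inner HTML of the following <dd>. The
# identifier 'pkg.func' is hypothetical, for illustration only.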


async def extract_contents(
    auxdata: __.ApplicationGlobals,
    source: str,
    objects: __.cabc.Sequence[ __.InventoryObject ], /, *,
    theme: __.Absential[ str ] = __.absent,
) -> list[ __.ContentDocument ]:
    ''' Extracts documentation content for specified objects. '''
    if not objects: return [ ]
    base_url = _urls.normalize_base_url( source )
    tasks = [
        _extract_object_documentation(
            auxdata, base_url, source, obj, theme )
        for obj in objects ]
    candidate_results = await __.asyncf.gather_async(
        *tasks, return_exceptions = True )
    results: list[ __.ContentDocument ] = [
        result.value for result in candidate_results
        if __.generics.is_value( result ) and result.value is not None ]
    return results
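
# Usage sketch (hypothetical caller; 'query_inventory' and the argument
# values are assumptions, not part of this module):
#
#     objects = await query_inventory( auxdata, source )
#     documents = await extract_contents(
#         auxdata, source, objects, theme = 'furo' )
#     for document in documents: print( document.documentation_url )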


def parse_documentation_html(
    content: str, element_id: str, url: str, *,
    theme: __.Absential[ str ] = __.absent
) -> __.cabc.Mapping[ str, str ]:
    ''' Parses HTML content to extract documentation sections. '''
    try: soup = _BeautifulSoup( content, 'lxml' )
    except Exception as exc:
        raise __.DocumentationParseFailure( element_id, exc ) from exc
    # Theme should come from detection metadata; when absent, container
    # discovery falls back to generic heuristics.
    container = _find_main_content_container( soup, theme )
    if __.is_absent( container ):
        raise __.DocumentationContentAbsence( element_id )
    element = container.find( id = element_id )
    if not element:
        raise __.DocumentationObjectAbsence( element_id, url )
    description = _extract_content_with_dsl( element, element_id, theme )
    return {
        'description': description,
        'object_name': element_id,
    }
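
# Illustrative example (hypothetical markup and identifier):
#
#     html = (
#         '<div class="body"><dt id="pkg.func">pkg.func()</dt>'
#         '<dd><p>Does a thing.</p></dd></div>' )
#     parse_documentation_html( html, 'pkg.func', 'https://example.org/api' )
#     # -> { 'description': '<p>Does a thing.</p>', 'object_name': 'pkg.func' }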


def _cleanup_content(
    content: str,
    cleanup_selectors: __.cabc.Sequence[ str ]
) -> str:
    ''' Removes unwanted elements from content using CSS selectors. '''
    if not content or not cleanup_selectors: return content
    soup = _BeautifulSoup( content, 'lxml' )
    for selector in cleanup_selectors:
        for match in soup.select( selector ):
            match.decompose( )
    # lxml wraps fragments in <html><body>; return only the inner markup.
    body = soup.body
    return body.decode_contents( ) if body is not None else str( soup )
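
# Illustrative effect (hypothetical fragment): with selectors
# [ 'a.headerlink' ], '<dd><p>Text<a class="headerlink">¶</a></p></dd>'
# becomes '<dd><p>Text</p></dd>'.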


def _extract_content_with_dsl(
    element: __.typx.Any,
    element_id: str,
    theme: __.Absential[ str ] = __.absent
) -> str:
    ''' Extracts content using DSL pattern configuration. '''
    if __.is_absent( theme ): pattern = _GENERIC_PATTERN
    else: pattern = THEME_EXTRACTION_PATTERNS.get( theme, _GENERIC_PATTERN )
    content_strategies = __.typx.cast(
        __.cabc.Mapping[ str, __.cabc.Mapping[ str, __.typx.Any ] ],
        pattern[ 'content_strategies' ] )
    strategy = content_strategies.get( element.name )
    if not strategy: return _generic_extraction( element )
    description = _extract_description_with_strategy( element, strategy )
    if 'cleanup_selectors' in pattern:
        cleanup_selectors = __.typx.cast(
            __.cabc.Sequence[ str ], pattern[ 'cleanup_selectors' ] )
        description = _cleanup_content( description, cleanup_selectors )
    return description


def _extract_description_with_strategy(
    element: __.typx.Any,
    strategy: __.cabc.Mapping[ str, __.typx.Any ]
) -> str:
    ''' Extracts description using DSL strategy. '''
    source_type = __.typx.cast( str, strategy[ 'description_source' ] )
    element_type = __.typx.cast(
        str, strategy.get( 'description_element', 'p' ) )
    return _get_description_by_source_type(
        element, source_type, element_type )


async def _extract_object_documentation(
    auxdata: __.ApplicationGlobals,
    base_url: __.typx.Any,
    location: str,
    obj: __.InventoryObject,
    theme: __.Absential[ str ] = __.absent,
) -> __.ContentDocument | None:
    ''' Extracts documentation for a single object. '''
    from . import conversion as _conversion
    doc_url = _urls.derive_documentation_url(
        base_url, obj.uri, obj.name )
    try:
        html_content = await __.retrieve_url_as_text(
            auxdata.content_cache, doc_url )
    except Exception as exc:
        _scribe.debug( "Failed to retrieve %s: %s", doc_url, exc )
        return None
    anchor = doc_url.fragment or str( obj.name )
    try:
        parsed_content = parse_documentation_html(
            html_content, anchor, str( doc_url ), theme = theme )
    except Exception as exc:
        _scribe.debug( "Failed to parse %s: %s", doc_url, exc )
        return None
    description = _conversion.html_to_markdown(
        parsed_content[ 'description' ] )
    content_id = __.produce_content_id( location, obj.name )
    return __.ContentDocument(
        inventory_object = obj,
        content_id = content_id,
        description = description,
        documentation_url = doc_url.geturl( ),
        extraction_metadata = __.immut.Dictionary( {
            'theme': theme if not __.is_absent( theme ) else 'unknown',
            'extraction_method': 'sphinx_html_parsing',
            'relevance_score': 1.0,
            'match_reasons': [ 'direct extraction' ],
        } )
    )


def _find_main_content_container(
    soup: __.typx.Any, theme: __.Absential[ str ] = __.absent
) -> __.Absential[ __.typx.Any ]:
    ''' Finds the main content container using theme-specific strategies. '''
    if theme == 'furo':
        containers = [
            soup.find( 'article', { 'role': 'main' } ),
            soup.find( 'div', { 'id': 'furo-main-content' } ),
        ]
    elif theme == 'sphinx_rtd_theme':
        containers = [
            soup.find( 'div', { 'class': 'document' } ),
            soup.find( 'div', { 'class': 'body' } ),
            soup.find( 'div', { 'role': 'main' } ),
        ]
    elif theme == 'pydoctheme': # Python docs
        containers = [
            soup.find( 'div', { 'class': 'body' } ),
            soup.find( 'div', { 'class': 'content' } ),
            soup.body, # Python docs often use body directly
        ]
    elif theme == 'flask': # Flask docs
        containers = [
            soup.find( 'div', { 'class': 'body' } ),
            soup.find( 'div', { 'class': 'content' } ),
            soup.body,
        ]
    elif theme == 'alabaster':
        containers = [
            soup.find( 'div', { 'class': 'body' } ),
            soup.find( 'div', { 'class': 'content' } ),
        ]
    else: # Generic fallback for unknown themes
        containers = [
            soup.find( 'article', { 'role': 'main' } ), # Furo theme
            soup.find( 'div', { 'class': 'body' } ), # Basic theme
            soup.find( 'div', { 'class': 'content' } ), # Nature theme
            soup.find( 'div', { 'class': 'main' } ), # Generic main
            soup.find( 'main' ), # HTML5 main element
            soup.find( 'div', { 'role': 'main' } ), # Role-based
            soup.body, # Fallback to body if nothing else works
        ]
    for container in containers:
        if container: return container
    return __.absent


def _generic_extraction( element: __.typx.Any ) -> str:
    ''' Generic fallback extraction for unknown element types. '''
    if element.parent:
        paragraph = element.parent.find( 'p' )
        if paragraph: return str( paragraph )
    return ''


def _get_description_by_source_type(
    element: __.typx.Any,
    source_type: str,
    element_type: str
) -> str:
    ''' Gets description content based on source type. '''
    match source_type:
        case 'next_sibling':
            return _get_sibling_text( element, element_type )
        case 'parent_next_sibling':
            return _get_parent_sibling_text( element, element_type )
        case 'parent_next_element':
            return _get_parent_element_text( element, element_type )
        case 'parent_content':
            return _get_parent_content_text( element, element_type )
        case 'first_paragraph':
            return _get_first_paragraph_text( element )
        case _: return ''
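
# Summary of source types and the helpers below (descriptive only):
#
#     next_sibling        -> element.find_next_sibling( element_type )
#     parent_next_sibling -> element.parent.find_next_sibling( element_type )
#     parent_next_element -> element.parent.find( element_type )
#     parent_content      -> element.parent.find( element_type )
#     first_paragraph     -> element.find( 'p' )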


def _get_first_paragraph_text( element: __.typx.Any ) -> str:
    ''' Gets HTML content from first paragraph within element. '''
    paragraph = element.find( 'p' )
    return str( paragraph ) if paragraph else ''


def _get_parent_content_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from content element within parent. '''
    if element.parent:
        content_elem = element.parent.find( element_type )
        return content_elem.decode_contents( ) if content_elem else ''
    return ''


def _get_parent_element_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from element within parent. '''
    if element.parent:
        next_elem = element.parent.find( element_type )
        return next_elem.decode_contents( ) if next_elem else ''
    return ''


def _get_parent_sibling_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from parent's next sibling element. '''
    if element.parent:
        sibling = element.parent.find_next_sibling( element_type )
        return sibling.decode_contents( ) if sibling else ''
    return ''


def _get_sibling_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from next sibling element. '''
    sibling = element.find_next_sibling( element_type )
    return sibling.decode_contents( ) if sibling else ''