Coverage for sources/librovore/structures/sphinx/extraction.py: 12%
114 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-02 00:02 +0000
1# vim: set filetype=python fileencoding=utf-8:
2# -*- coding: utf-8 -*-
4#============================================================================#
5# #
6# Licensed under the Apache License, Version 2.0 (the "License"); #
7# you may not use this file except in compliance with the License. #
8# You may obtain a copy of the License at #
9# #
10# http://www.apache.org/licenses/LICENSE-2.0 #
11# #
12# Unless required by applicable law or agreed to in writing, software #
13# distributed under the License is distributed on an "AS IS" BASIS, #
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
15# See the License for the specific language governing permissions and #
16# limitations under the License. #
17# #
18#============================================================================#
21''' Documentation extraction and content retrieval. '''
24from bs4 import BeautifulSoup as _BeautifulSoup
26from . import __
27from . import urls as _urls
30_scribe = __.acquire_scribe( __name__ )
# Theme-specific extraction DSL, keyed by Sphinx theme name. Each pattern has:
#   anchor_elements:     tag names which may carry an object's anchor id
#   content_strategies:  per-tag recipe; 'description_source' names the
#                        navigation from anchor to description and
#                        'description_element' the tag to extract
#   cleanup_selectors:   CSS selectors for boilerplate removal afterwards
THEME_EXTRACTION_PATTERNS: __.cabc.Mapping[
    str, __.cabc.Mapping[ str, __.typx.Any ]
] = __.immut.Dictionary( {
    'pydoctheme': __.immut.Dictionary( {
        'anchor_elements': [ 'dt', 'a' ],
        'content_strategies': __.immut.Dictionary( {
            'dt': __.immut.Dictionary( {
                'description_source': 'next_sibling',
                'description_element': 'dd',
            } ),
            'a': __.immut.Dictionary( {
                'description_source': 'parent_next_sibling',
                'description_element': 'dd',
            } ),
        } ),
        'cleanup_selectors': [ 'a.headerlink' ],
    } ),
    'furo': __.immut.Dictionary( {
        'anchor_elements': [ 'span', 'a', 'dt' ],
        'content_strategies': __.immut.Dictionary( {
            'span': __.immut.Dictionary( {
                'description_source': 'parent_next_element',
                'description_element': 'p',
                'fallback_container': 'section',
            } ),
            'a': __.immut.Dictionary( {
                'description_source': 'parent_next_element',
                'description_element': 'p',
            } ),
            'dt': __.immut.Dictionary( {
                'description_source': 'next_sibling',
                'description_element': 'dd',
            } ),
        } ),
        'cleanup_selectors': [ 'a.headerlink', '.highlight' ],
    } ),
    'sphinx_rtd_theme': __.immut.Dictionary( {
        'anchor_elements': [ 'dt', 'span', 'a' ],
        'content_strategies': __.immut.Dictionary( {
            'dt': __.immut.Dictionary( {
                'description_source': 'next_sibling',
                'description_element': 'dd',
            } ),
            'span': __.immut.Dictionary( {
                'description_source': 'parent_content',
                'description_element': 'p',
            } ),
        } ),
        'cleanup_selectors': [ 'a.headerlink' ],
    } ),
} )
# Fallback extraction DSL applied when the theme is unknown or unmapped.
# Same schema as entries in THEME_EXTRACTION_PATTERNS.
_GENERIC_PATTERN = __.immut.Dictionary( {
    'anchor_elements': [ 'dt', 'span', 'a', 'section', 'div' ],
    'content_strategies': __.immut.Dictionary( {
        'dt': __.immut.Dictionary( {
            'description_source': 'next_sibling',
            'description_element': 'dd',
        } ),
        'section': __.immut.Dictionary( {
            'description_source': 'first_paragraph',
            'description_element': 'p',
        } ),
        'span': __.immut.Dictionary( {
            'description_source': 'parent_next_element',
            'description_element': 'p',
        } ),
        'a': __.immut.Dictionary( {
            'description_source': 'parent_next_element',
            'description_element': 'p',
        } ),
    } ),
    'cleanup_selectors': [ 'a.headerlink' ],
} )
async def extract_contents(
    auxdata: __.ApplicationGlobals,
    source: str,
    objects: __.cabc.Sequence[ __.InventoryObject ], /, *,
    theme: __.Absential[ str ] = __.absent,
) -> list[ __.ContentDocument ]:
    ''' Extracts documentation content for specified objects.

        Objects are fetched concurrently; failures for individual objects
        are dropped rather than failing the batch.
    '''
    base_url = _urls.normalize_base_url( source )
    if not objects: return [ ]
    extraction_tasks = [
        _extract_object_documentation( auxdata, base_url, obj, theme )
        for obj in objects ]
    outcomes = await __.asyncf.gather_async(
        *extraction_tasks, return_exceptions = True )
    documents: list[ __.ContentDocument ] = [ ]
    for outcome in outcomes:
        # Skip failed extractions and objects without retrievable content.
        if not __.generics.is_value( outcome ): continue
        if outcome.value is None: continue
        documents.append( outcome.value )
    return documents
def parse_documentation_html(
    content: str, element_id: str, url: str, *,
    theme: __.Absential[ str ] = __.absent
) -> __.cabc.Mapping[ str, str ]:
    ''' Parses HTML content to extract documentation sections.

        Raises project-specific failures when the HTML cannot be parsed,
        when no main content container exists, or when the anchored
        element is missing from the page.
    '''
    try: soup = _BeautifulSoup( content, 'lxml' )
    except Exception as exc:
        raise __.DocumentationParseFailure( element_id, exc ) from exc
    # Theme comes from detection metadata; when absent, container lookup
    # falls back to generic heuristics.
    container = _find_main_content_container( soup, theme )
    if __.is_absent( container ):
        raise __.DocumentationContentAbsence( element_id )
    anchored = container.find( id = element_id )
    if not anchored:
        raise __.DocumentationObjectAbsence( element_id, url )
    description = _extract_content_with_dsl( anchored, element_id, theme )
    return {
        'description': description,
        'object_name': element_id,
    }
def _cleanup_content(
    content: str,
    cleanup_selectors: __.cabc.Sequence[ str ]
) -> str:
    ''' Removes unwanted elements from content using CSS selectors.

        Parses the HTML fragment, decomposes every node matching any of
        the supplied selectors, and reserializes. Best-effort: on parse
        failure, or when nothing matches, the original content is
        returned unchanged. Selectors originate from the internal
        extraction DSL, so they are assumed syntactically valid.
    '''
    if not content or not cleanup_selectors: return content
    try: fragment = _BeautifulSoup( content, 'lxml' )
    except Exception: return content  # keep original on parse failure
    removed = False
    for selector in cleanup_selectors:
        for node in fragment.select( selector ):
            node.decompose( )
            removed = True
    # Avoid reserialization churn when no selector matched anything.
    if not removed: return content
    # NOTE(review): lxml wraps fragments in <html><body>; emit only the
    # body's inner HTML so the fragment shape is preserved.
    body = fragment.body
    if body is not None: return body.decode_contents( )
    return str( fragment )
def _extract_content_with_dsl(
    element: __.typx.Any,
    element_id: str,
    theme: __.Absential[ str ] = __.absent
) -> str:
    ''' Extracts content using DSL pattern configuration.

        Selects the theme's pattern (falling back to the generic one),
        applies the per-tag strategy for the anchor element, then strips
        boilerplate via the pattern's cleanup selectors.
    '''
    if __.is_absent( theme ): pattern = _GENERIC_PATTERN
    else:
        pattern = THEME_EXTRACTION_PATTERNS.get( theme, _GENERIC_PATTERN )
    strategies = __.typx.cast(
        __.cabc.Mapping[ str, __.cabc.Mapping[ str, __.typx.Any ] ],
        pattern[ 'content_strategies' ] )
    strategy = strategies.get( element.name )
    # Unknown anchor tags get the generic parent-paragraph heuristic.
    if not strategy: return _generic_extraction( element )
    description = _extract_description_with_strategy( element, strategy )
    if 'cleanup_selectors' not in pattern: return description
    selectors = __.typx.cast(
        __.cabc.Sequence[ str ], pattern[ 'cleanup_selectors' ] )
    return _cleanup_content( description, selectors )
def _extract_description_with_strategy(
    element: __.typx.Any,
    strategy: __.cabc.Mapping[ str, __.typx.Any ]
) -> str:
    ''' Extracts description using DSL strategy.

        Reads the navigation mode and target tag (default ``p``) from the
        strategy mapping and dispatches accordingly.
    '''
    navigation = __.typx.cast( str, strategy[ 'description_source' ] )
    target_tag = __.typx.cast(
        str, strategy.get( 'description_element', 'p' ) )
    return _get_description_by_source_type( element, navigation, target_tag )
async def _extract_object_documentation(
    auxdata: __.ApplicationGlobals,
    base_url: __.typx.Any,
    obj: __.InventoryObject,
    theme: __.Absential[ str ] = __.absent,
) -> __.ContentDocument | None:
    ''' Extracts documentation for a single object.

        Returns ``None`` on retrieval or parse failure (best-effort;
        batch callers drop missing results).
    '''
    from . import conversion as _conversion
    doc_url = _urls.derive_documentation_url( base_url, obj.uri, obj.name )
    try:
        page_html = await __.retrieve_url_as_text(
            auxdata.content_cache, doc_url )
    except Exception as exc:
        _scribe.debug( "Failed to retrieve %s: %s", doc_url, exc )
        return None
    # Prefer the URL fragment as the anchor; fall back to the object name.
    anchor = doc_url.fragment or str( obj.name )
    try:
        parsed = parse_documentation_html(
            page_html, anchor, str( doc_url ), theme = theme )
    except Exception: return None
    theme_label = 'unknown' if __.is_absent( theme ) else theme
    description = _conversion.html_to_markdown( parsed[ 'description' ] )
    return __.ContentDocument(
        inventory_object = obj,
        description = description,
        documentation_url = doc_url.geturl( ),
        extraction_metadata = __.immut.Dictionary( {
            'theme': theme_label,
            'extraction_method': 'sphinx_html_parsing',
            'relevance_score': 1.0,
            'match_reasons': [ 'direct extraction' ],
        } )
    )
def _find_main_content_container(
    soup: __.typx.Any, theme: __.Absential[ str ] = __.absent
) -> __.Absential[ __.typx.Any ]:
    ''' Finds the main content container using theme-specific strategies.

        Tries a theme-specific list of (tag, attrs) lookups in order;
        a ``None`` entry means "use the document body". Unknown or
        absent themes use a generic candidate list.
    '''
    body_div = ( 'div', { 'class': 'body' } )
    content_div = ( 'div', { 'class': 'content' } )
    generic_specs = (
        ( 'article', { 'role': 'main' } ),  # Furo theme
        body_div,                           # Basic theme
        content_div,                        # Nature theme
        ( 'div', { 'class': 'main' } ),     # Generic main
        ( 'main', { } ),                    # HTML5 main element
        ( 'div', { 'role': 'main' } ),      # Role-based
        None,                               # Fallback to body
    )
    specs_by_theme: dict[ str, tuple ] = {
        'furo': (
            ( 'article', { 'role': 'main' } ),
            ( 'div', { 'id': 'furo-main-content' } ),
        ),
        'sphinx_rtd_theme': (
            ( 'div', { 'class': 'document' } ),
            body_div,
            ( 'div', { 'role': 'main' } ),
        ),
        # Python docs often use body directly.
        'pydoctheme': ( body_div, content_div, None ),
        'flask': ( body_div, content_div, None ),
        'alabaster': ( body_div, content_div ),
    }
    if __.is_absent( theme ): specs = generic_specs
    else: specs = specs_by_theme.get( theme, generic_specs )
    for spec in specs:
        candidate = soup.body if spec is None else soup.find( *spec )
        if candidate: return candidate
    return __.absent
def _generic_extraction( element: __.typx.Any ) -> str:
    ''' Generic fallback extraction for unknown element types.

        Serializes the first paragraph found under the element's parent,
        or returns an empty string when there is none.
    '''
    parent = element.parent
    if not parent: return ''
    paragraph = parent.find( 'p' )
    if not paragraph: return ''
    return str( paragraph )
def _get_description_by_source_type(
    element: __.typx.Any,
    source_type: str,
    element_type: str
) -> str:
    ''' Gets description content based on source type.

        Dispatches on the DSL navigation mode; unrecognized modes yield
        an empty string.
    '''
    if source_type == 'first_paragraph':
        return _get_first_paragraph_text( element )
    extractors = {
        'next_sibling': _get_sibling_text,
        'parent_next_sibling': _get_parent_sibling_text,
        'parent_next_element': _get_parent_element_text,
        'parent_content': _get_parent_content_text,
    }
    extractor = extractors.get( source_type )
    if extractor is None: return ''
    return extractor( element, element_type )
def _get_first_paragraph_text( element: __.typx.Any ) -> str:
    ''' Gets HTML content from first paragraph within element. '''
    paragraph = element.find( 'p' )
    if not paragraph: return ''
    return str( paragraph )
def _get_parent_content_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from content element within parent. '''
    parent = element.parent
    if not parent: return ''
    content_elem = parent.find( element_type )
    if not content_elem: return ''
    return content_elem.decode_contents( )
def _get_parent_element_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from element within parent. '''
    parent = element.parent
    if not parent: return ''
    target = parent.find( element_type )
    if not target: return ''
    return target.decode_contents( )
def _get_parent_sibling_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from parent's next sibling element. '''
    parent = element.parent
    if not parent: return ''
    sibling = parent.find_next_sibling( element_type )
    if not sibling: return ''
    return sibling.decode_contents( )
def _get_sibling_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from next sibling element. '''
    sibling = element.find_next_sibling( element_type )
    if not sibling: return ''
    return sibling.decode_contents( )