Coverage for sources/librovore/structures/sphinx/extraction.py: 15%

118 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-09-28 22:09 +0000

1# vim: set filetype=python fileencoding=utf-8: 

2# -*- coding: utf-8 -*- 

3 

4#============================================================================# 

5# # 

6# Licensed under the Apache License, Version 2.0 (the "License"); # 

7# you may not use this file except in compliance with the License. # 

8# You may obtain a copy of the License at # 

9# # 

10# http://www.apache.org/licenses/LICENSE-2.0 # 

11# # 

12# Unless required by applicable law or agreed to in writing, software # 

13# distributed under the License is distributed on an "AS IS" BASIS, # 

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 

15# See the License for the specific language governing permissions and # 

16# limitations under the License. # 

17# # 

18#============================================================================# 

19 

20 

21''' Documentation extraction and content retrieval. ''' 

22 

23 

24from bs4 import BeautifulSoup as _BeautifulSoup 

25 

26from . import __ 

27from . import urls as _urls 

28from .patterns import THEME_PATTERNS as _THEME_PATTERNS 

29from .patterns import UNIVERSAL_PATTERNS as _UNIVERSAL_PATTERNS 

30 

31 

# Module-level scribe (logger) for diagnostic messages from this module.
_scribe = __.acquire_scribe( __name__ )

33 

34 

async def extract_contents(
    auxdata: __.ApplicationGlobals,
    source: str,
    objects: __.cabc.Sequence[ __.InventoryObject ], /, *,
    theme: __.Absential[ str ] = __.absent,
) -> list[ __.ContentDocument ]:
    ''' Extracts documentation content for specified objects.

        Fetches and parses documentation pages concurrently. Objects
        whose retrieval or parsing fails are omitted from the result
        rather than aborting the whole batch.
    '''
    # Short-circuit before any URL normalization work when there is
    # nothing to extract. (Previously the base URL was computed first
    # and then discarded for empty input.)
    if not objects: return [ ]
    base_url = _urls.normalize_base_url( source )
    tasks = [
        _extract_object_documentation(
            auxdata, base_url, source, obj, theme )
        for obj in objects ]
    candidate_results = await __.asyncf.gather_async(
        *tasks, return_exceptions = True )
    # Keep only successful extractions which produced a document;
    # failed tasks surface as error results, unavailable pages as None.
    results: list[ __.ContentDocument ] = [
        result.value for result in candidate_results
        if __.generics.is_value( result ) and result.value is not None ]
    return results

54 

55 

def parse_documentation_html(
    content: str, element_id: str, url: str, *,
    theme: __.Absential[ str ] = __.absent
) -> __.cabc.Mapping[ str, str ]:
    ''' Parses HTML content to extract documentation sections.

        Raises parse, content-absence, or object-absence failures when
        the HTML cannot be parsed, lacks a main content container, or
        does not contain the requested element id.
    '''
    try: soup = _BeautifulSoup( content, 'lxml' )
    except Exception as exc:
        raise __.DocumentationParseFailure( element_id, exc ) from exc
    # Theme normally comes from detection metadata; when absent, the
    # container search falls back to generic selector patterns.
    container = _find_main_content_container( soup, theme )
    if __.is_absent( container ):
        raise __.DocumentationContentAbsence( element_id )
    element = container.find( id = element_id )
    if not element:
        raise __.DocumentationObjectAbsence( element_id, url )
    description = _extract_content_with_dsl( element, element_id, theme )
    return {
        'object_name': element_id,
        'description': description,
    }

79 

80 

def _cleanup_content(
    content: str,
    cleanup_selectors: __.cabc.Sequence[ str ]
) -> str:
    ''' Removes unwanted elements from content using CSS selectors. '''
    # Nothing to do for an empty selector list or blank content.
    if not cleanup_selectors or not content.strip( ): return content
    tree: __.typx.Any = _BeautifulSoup( content, 'lxml' )
    for selector in cleanup_selectors:
        for node in tree.select( selector ): node.decompose( )
    return str( tree )

93 

94 

def _extract_content_with_dsl(
    element: __.typx.Any,
    element_id: str,
    theme: __.Absential[ str ] = __.absent
) -> str:
    ''' Extracts content using universal pattern configuration. '''
    # API signatures ('dt' elements carrying signature classes) get
    # dedicated handling; everything else uses generic extraction
    # followed by navigation cleanup.
    if element.name == 'dt' and _is_api_signature( element ):
        return _extract_api_signature_content( element )
    raw = _generic_extraction( element )
    selectors = (
        _UNIVERSAL_PATTERNS[ 'navigation_cleanup' ][ 'universal_selectors' ] )
    return _cleanup_content( raw, selectors )

108 

109 

def _extract_description_with_strategy(
    element: __.typx.Any,
    strategy: __.cabc.Mapping[ str, __.typx.Any ]
) -> str:
    ''' Extracts description using DSL strategy. '''
    # 'description_element' defaults to paragraph tags when the
    # strategy does not specify one.
    return _get_description_by_source_type(
        element,
        strategy[ 'description_source' ],
        strategy.get( 'description_element', 'p' ) )

119 

120 

async def _extract_object_documentation(
    auxdata: __.ApplicationGlobals,
    base_url: __.typx.Any,
    location: str,
    obj: __.InventoryObject,
    theme: __.Absential[ str ] = __.absent,
) -> __.ContentDocument | None:
    ''' Extracts documentation for a single object.

        Returns ``None`` on retrieval or parse failure so that batch
        extraction can skip unavailable objects.
    '''
    from . import conversion as _conversion
    doc_url = _urls.derive_documentation_url( base_url, obj.uri, obj.name )
    try:
        html_content = await __.retrieve_url_as_text(
            auxdata.content_cache, doc_url )
    except Exception as exc:
        _scribe.debug( "Failed to retrieve %s: %s", doc_url, exc )
        return None
    # Prefer the URL fragment as the anchor; fall back to the object
    # name when the URL carries no fragment.
    anchor = doc_url.fragment or str( obj.name )
    try:
        parsed_content = parse_documentation_html(
            html_content, anchor, str( doc_url ), theme = theme )
    # Best-effort: unparseable pages are skipped, not fatal.
    except Exception: return None
    description = _conversion.html_to_markdown(
        parsed_content[ 'description' ] )
    content_id = __.produce_content_id( location, obj.name )
    metadata = __.immut.Dictionary( {
        'theme': 'unknown' if __.is_absent( theme ) else theme,
        'extraction_method': 'sphinx_html_parsing',
        'relevance_score': 1.0,
        'match_reasons': [ 'direct extraction' ],
    } )
    return __.ContentDocument(
        inventory_object = obj,
        content_id = content_id,
        description = description,
        documentation_url = doc_url.geturl( ),
        extraction_metadata = metadata )

159 

160 

161 

162 

def _find_main_content_container(
    soup: __.typx.Any, theme: __.Absential[ str ] = __.absent
) -> __.Absential[ __.typx.Any ]:
    ''' Finds main content container trying theme-specific patterns first. '''
    selectors: list[ str ] = [ ]
    # Theme-specific selectors take priority when the theme is known.
    if (
        not __.is_absent( theme )
        and theme in _THEME_PATTERNS[ 'content_containers' ]
    ): selectors.extend( _THEME_PATTERNS[ 'content_containers' ][ theme ] )
    # Universal selectors serve as the generic fallback.
    selectors.extend(
        _UNIVERSAL_PATTERNS[ 'content_containers' ][ 'universal_selectors' ] )
    for selector in selectors:
        if found := soup.select_one( selector ): return found
    return __.absent

183 

184 

185 

186 

187def _generic_extraction( element: __.typx.Any ) -> str: 

188 ''' Generic fallback extraction for unknown element types. ''' 

189 description = '' 

190 if element.parent: 

191 next_p = element.parent.find( 'p' ) 

192 if next_p: 

193 description = str( next_p ) 

194 return description 

195 

196 

197def _get_description_by_source_type( 

198 element: __.typx.Any, 

199 source_type: str, 

200 element_type: str 

201) -> str: 

202 ''' Gets description content based on source type. ''' 

203 extractors = { 

204 'next_sibling': lambda: _get_sibling_text( element, element_type ), 

205 'parent_next_sibling': lambda: _get_parent_sibling_text( 

206 element, element_type ), 

207 'parent_next_element': lambda: _get_parent_element_text( 

208 element, element_type ), 

209 'parent_content': lambda: _get_parent_content_text( 

210 element, element_type ), 

211 'first_paragraph': lambda: _get_first_paragraph_text( element ), 

212 'first_main_paragraph': lambda: _get_first_main_paragraph_text( 

213 element ), 

214 } 

215 extractor = extractors.get( source_type ) 

216 return extractor( ) if extractor else '' 

217 

218 

219def _get_first_paragraph_text( element: __.typx.Any ) -> str: 

220 ''' Gets HTML content from first paragraph within element. ''' 

221 paragraph = element.find( 'p' ) 

222 return str( paragraph ) if paragraph else '' 

223 

224 

225def _get_first_main_paragraph_text( element: __.typx.Any ) -> str: 

226 ''' Gets HTML content from first paragraph, skipping sidebars. ''' 

227 for paragraph in element.find_all( 'p' ): 

228 if paragraph.find_parent( [ 'aside', 'nav', 'header' ] ): 

229 continue 

230 return str( paragraph ) if paragraph else '' 

231 return '' 

232 

233 

234def _get_parent_content_text( element: __.typx.Any, element_type: str ) -> str: 

235 ''' Gets HTML content from content element within parent. ''' 

236 if element.parent: 

237 content_elem = element.parent.find( element_type ) 

238 return content_elem.decode_contents( ) if content_elem else '' 

239 return '' 

240 

241 

242def _get_parent_element_text( element: __.typx.Any, element_type: str ) -> str: 

243 ''' Gets HTML content from element within parent. ''' 

244 if element.parent: 

245 next_elem = element.parent.find( element_type ) 

246 return next_elem.decode_contents( ) if next_elem else '' 

247 return '' 

248 

249 

250def _get_parent_sibling_text( element: __.typx.Any, element_type: str ) -> str: 

251 ''' Gets HTML content from parent's next sibling element. ''' 

252 if element.parent: 

253 sibling = element.parent.find_next_sibling( element_type ) 

254 return sibling.decode_contents( ) if sibling else '' 

255 return '' 

256 

257 

258def _get_sibling_text( element: __.typx.Any, element_type: str ) -> str: 

259 ''' Gets HTML content from next sibling element. ''' 

260 sibling = element.find_next_sibling( element_type ) 

261 return sibling.decode_contents( ) if sibling else '' 

262 

263 

def _is_api_signature( element: __.typx.Any ) -> bool:
    ''' Determines if element is an API signature using universal patterns. '''
    wanted = _UNIVERSAL_PATTERNS[ 'api_signatures' ][ 'signature_classes' ]
    present = element.get( 'class', [ ] )
    # Any overlap between the element's classes and the configured
    # signature classes marks it as an API signature.
    return not set( wanted ).isdisjoint( present )

270 

271 

def _extract_api_signature_content( element: __.typx.Any ) -> str:
    ''' Extracts API signature content using universal patterns. '''
    selector = (
        _UNIVERSAL_PATTERNS[ 'api_signatures' ][ 'description_selector' ] )
    follower = element.find_next_sibling( selector )
    if follower: return follower.decode_contents( )
    return ''