Coverage for sources/librovore/structures/sphinx/extraction.py: 15%

118 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-09-28 22:09 +0000

1# vim: set filetype=python fileencoding=utf-8: 

2# -*- coding: utf-8 -*- 

3 

4#============================================================================# 

5# # 

6# Licensed under the Apache License, Version 2.0 (the "License"); # 

7# you may not use this file except in compliance with the License. # 

8# You may obtain a copy of the License at # 

9# # 

10# http://www.apache.org/licenses/LICENSE-2.0 # 

11# # 

12# Unless required by applicable law or agreed to in writing, software # 

13# distributed under the License is distributed on an "AS IS" BASIS, # 

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 

15# See the License for the specific language governing permissions and # 

16# limitations under the License. # 

17# # 

18#============================================================================# 

19 

20 

21''' Documentation extraction and content retrieval. ''' 

22 

23 

24from bs4 import BeautifulSoup as _BeautifulSoup 

25 

26from . import __ 

27from . import urls as _urls 

28from .patterns import THEME_PATTERNS as _THEME_PATTERNS 

29from .patterns import UNIVERSAL_PATTERNS as _UNIVERSAL_PATTERNS 

30 

31 

# Module-level scribe (logger) for diagnostic messages from this module.
_scribe = __.acquire_scribe( __name__ )

33 

34 

async def extract_contents(
    auxdata: __.ApplicationGlobals,
    source: str,
    objects: __.cabc.Sequence[ __.InventoryObject ], /, *,
    theme: __.Absential[ str ] = __.absent,
) -> list[ __.ContentDocument ]:
    ''' Extracts documentation content for specified objects.

        Fetches and parses documentation pages concurrently. Objects
        whose retrieval or parsing fails are omitted from the result
        rather than aborting the whole batch.
    '''
    # Short-circuit before any URL normalization work when there is
    # nothing to extract. (Previously the base URL was computed first
    # and then discarded for empty input.)
    if not objects: return [ ]
    base_url = _urls.normalize_base_url( source )
    tasks = [
        _extract_object_documentation(
            auxdata, base_url, source, obj, theme )
        for obj in objects ]
    candidate_results = await __.asyncf.gather_async(
        *tasks, return_exceptions = True )
    # Keep only successful extractions which produced a document;
    # failed tasks surface as error results, unavailable pages as None.
    results: list[ __.ContentDocument ] = [
        result.value for result in candidate_results
        if __.generics.is_value( result ) and result.value is not None ]
    return results

54 

55 

def parse_documentation_html(
    content: str, element_id: str, url: str, *,
    theme: __.Absential[ str ] = __.absent
) -> __.cabc.Mapping[ str, str ]:
    ''' Parses HTML content to extract documentation sections.

        Raises parse, content-absence, or object-absence failures when
        the HTML cannot be parsed, lacks a main content container, or
        does not contain the requested element id.
    '''
    try: soup = _BeautifulSoup( content, 'lxml' )
    except Exception as exc:
        raise __.DocumentationParseFailure( element_id, exc ) from exc
    # Theme normally comes from detection metadata; when absent, the
    # container search falls back to generic selector patterns.
    container = _find_main_content_container( soup, theme )
    if __.is_absent( container ):
        raise __.DocumentationContentAbsence( element_id )
    element = container.find( id = element_id )
    if not element:
        raise __.DocumentationObjectAbsence( element_id, url )
    description = _extract_content_with_dsl( element, element_id, theme )
    return {
        'object_name': element_id,
        'description': description,
    }

79 

80 

def _cleanup_content(
    content: str,
    cleanup_selectors: __.cabc.Sequence[ str ]
) -> str:
    ''' Removes unwanted elements from content using CSS selectors. '''
    # Nothing to do for an empty selector list or blank content.
    if not cleanup_selectors or not content.strip( ): return content
    tree: __.typx.Any = _BeautifulSoup( content, 'lxml' )
    for selector in cleanup_selectors:
        for node in tree.select( selector ): node.decompose( )
    return str( tree )

93 

94 

def _extract_content_with_dsl(
    element: __.typx.Any,
    element_id: str,
    theme: __.Absential[ str ] = __.absent
) -> str:
    ''' Extracts content using universal pattern configuration. '''
    # API signatures ('dt' elements carrying signature classes) get
    # dedicated handling; everything else uses generic extraction
    # followed by navigation cleanup.
    if element.name == 'dt' and _is_api_signature( element ):
        return _extract_api_signature_content( element )
    raw = _generic_extraction( element )
    selectors = (
        _UNIVERSAL_PATTERNS[ 'navigation_cleanup' ][ 'universal_selectors' ] )
    return _cleanup_content( raw, selectors )

108 

109 

def _extract_description_with_strategy(
    element: __.typx.Any,
    strategy: __.cabc.Mapping[ str, __.typx.Any ]
) -> str:
    ''' Extracts description using DSL strategy. '''
    # 'description_element' defaults to paragraph tags when the
    # strategy does not specify one.
    return _get_description_by_source_type(
        element,
        strategy[ 'description_source' ],
        strategy.get( 'description_element', 'p' ) )

119 

120 

async def _extract_object_documentation(
    auxdata: __.ApplicationGlobals,
    base_url: __.typx.Any,
    location: str,
    obj: __.InventoryObject,
    theme: __.Absential[ str ] = __.absent,
) -> __.ContentDocument | None:
    ''' Extracts documentation for a single object.

        Returns ``None`` on retrieval or parse failure so that batch
        extraction can skip unavailable objects.
    '''
    from . import conversion as _conversion
    doc_url = _urls.derive_documentation_url( base_url, obj.uri, obj.name )
    try:
        html_content = await __.retrieve_url_as_text(
            auxdata.content_cache, doc_url )
    except Exception as exc:
        _scribe.debug( "Failed to retrieve %s: %s", doc_url, exc )
        return None
    # Prefer the URL fragment as the anchor; fall back to the object
    # name when the URL carries no fragment.
    anchor = doc_url.fragment or str( obj.name )
    try:
        parsed_content = parse_documentation_html(
            html_content, anchor, str( doc_url ), theme = theme )
    # Best-effort: unparseable pages are skipped, not fatal.
    except Exception: return None
    description = _conversion.html_to_markdown(
        parsed_content[ 'description' ] )
    content_id = __.produce_content_id( location, obj.name )
    metadata = __.immut.Dictionary( {
        'theme': 'unknown' if __.is_absent( theme ) else theme,
        'extraction_method': 'sphinx_html_parsing',
        'relevance_score': 1.0,
        'match_reasons': [ 'direct extraction' ],
    } )
    return __.ContentDocument(
        inventory_object = obj,
        content_id = content_id,
        description = description,
        documentation_url = doc_url.geturl( ),
        extraction_metadata = metadata )

159 

160 

161 

162 

def _find_main_content_container(
    soup: __.typx.Any, theme: __.Absential[ str ] = __.absent
) -> __.Absential[ __.typx.Any ]:
    ''' Finds main content container trying theme-specific patterns first. '''
    selectors: list[ str ] = [ ]
    # Theme-specific selectors take priority when the theme is known.
    if (
        not __.is_absent( theme )
        and theme in _THEME_PATTERNS[ 'content_containers' ]
    ): selectors.extend( _THEME_PATTERNS[ 'content_containers' ][ theme ] )
    # Universal selectors serve as the generic fallback.
    selectors.extend(
        _UNIVERSAL_PATTERNS[ 'content_containers' ][ 'universal_selectors' ] )
    for selector in selectors:
        if found := soup.select_one( selector ): return found
    return __.absent

183 

184 

185 

186 

187def _generic_extraction( element: __.typx.Any ) -> str: 

188 ''' Generic fallback extraction for unknown element types. ''' 

189 description = '' 

190 if element.parent: 

191 next_p = element.parent.find( 'p' ) 

192 if next_p: 

193 description = str( next_p ) 

194 return description 

195 

196 

197def _get_description_by_source_type( 

198 element: __.typx.Any, 

199 source_type: str, 

200 element_type: str 

201) -> str: 

202 ''' Gets description content based on source type. ''' 

203 extractors = { 

204 'next_sibling': lambda: _get_sibling_text( element, element_type ), 

205 'parent_next_sibling': lambda: _get_parent_sibling_text( 

206 element, element_type ), 

207 'parent_next_element': lambda: _get_parent_element_text( 

208 element, element_type ), 

209 'parent_content': lambda: _get_parent_content_text( 

210 element, element_type ), 

211 'first_paragraph': lambda: _get_first_paragraph_text( element ), 

212 'first_main_paragraph': lambda: _get_first_main_paragraph_text( 

213 element ), 

214 } 

215 extractor = extractors.get( source_type ) 

216 return extractor( ) if extractor else '' 

217 

218 

219def _get_first_paragraph_text( element: __.typx.Any ) -> str: 

220 ''' Gets HTML content from first paragraph within element. ''' 

221 paragraph = element.find( 'p' ) 

222 return str( paragraph ) if paragraph else '' 

223 

224 

225def _get_first_main_paragraph_text( element: __.typx.Any ) -> str: 

226 ''' Gets HTML content from first paragraph, skipping sidebars. ''' 

227 for paragraph in element.find_all( 'p' ): 

228 if paragraph.find_parent( [ 'aside', 'nav', 'header' ] ): 

229 continue 

230 return str( paragraph ) if paragraph else '' 

231 return '' 

232 

233 

234def _get_parent_content_text( element: __.typx.Any, element_type: str ) -> str: 

235 ''' Gets HTML content from content element within parent. ''' 

236 if element.parent: 

237 content_elem = element.parent.find( element_type ) 

238 return content_elem.decode_contents( ) if content_elem else '' 

239 return '' 

240 

241 

242def _get_parent_element_text( element: __.typx.Any, element_type: str ) -> str: 

243 ''' Gets HTML content from element within parent. ''' 

244 if element.parent: 

245 next_elem = element.parent.find( element_type ) 

246 return next_elem.decode_contents( ) if next_elem else '' 

247 return '' 

248 

249 

250def _get_parent_sibling_text( element: __.typx.Any, element_type: str ) -> str: 

251 ''' Gets HTML content from parent's next sibling element. ''' 

252 if element.parent: 

253 sibling = element.parent.find_next_sibling( element_type ) 

254 return sibling.decode_contents( ) if sibling else '' 

255 return '' 

256 

257 

258def _get_sibling_text( element: __.typx.Any, element_type: str ) -> str: 

259 ''' Gets HTML content from next sibling element. ''' 

260 sibling = element.find_next_sibling( element_type ) 

261 return sibling.decode_contents( ) if sibling else '' 

262 

263 

def _is_api_signature( element: __.typx.Any ) -> bool:
    ''' Determines if element is an API signature using universal patterns. '''
    wanted = _UNIVERSAL_PATTERNS[ 'api_signatures' ][ 'signature_classes' ]
    present = element.get( 'class', [ ] )
    # Any overlap between the element's classes and the configured
    # signature classes marks it as an API signature.
    return not set( wanted ).isdisjoint( present )

270 

271 

def _extract_api_signature_content( element: __.typx.Any ) -> str:
    ''' Extracts API signature content using universal patterns. '''
    selector = (
        _UNIVERSAL_PATTERNS[ 'api_signatures' ][ 'description_selector' ] )
    follower = element.find_next_sibling( selector )
    if follower: return follower.decode_contents( )
    return ''