Coverage for sources/librovore/structures/sphinx/extraction.py: 12%

115 statements  

coverage.py v7.10.6, created at 2025-09-06 02:25 +0000

# vim: set filetype=python fileencoding=utf-8:
# -*- coding: utf-8 -*-

#============================================================================#
#                                                                            #
# Licensed under the Apache License, Version 2.0 (the "License");           #
# you may not use this file except in compliance with the License.          #
# You may obtain a copy of the License at                                    #
#                                                                            #
#   http://www.apache.org/licenses/LICENSE-2.0                              #
#                                                                            #
# Unless required by applicable law or agreed to in writing, software       #
# distributed under the License is distributed on an "AS IS" BASIS,         #
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  #
# See the License for the specific language governing permissions and       #
# limitations under the License.                                            #
#                                                                            #
#============================================================================#


''' Documentation extraction and content retrieval. '''


from bs4 import BeautifulSoup as _BeautifulSoup

from . import __
from . import urls as _urls


_scribe = __.acquire_scribe( __name__ )


# Theme-specific content extraction patterns
THEME_EXTRACTION_PATTERNS: __.cabc.Mapping[
    str, __.cabc.Mapping[ str, __.typx.Any ]
] = __.immut.Dictionary( {
    'pydoctheme': __.immut.Dictionary( {
        'anchor_elements': [ 'dt', 'a' ],
        'content_strategies': __.immut.Dictionary( {
            'dt': __.immut.Dictionary( {
                'description_source': 'next_sibling',
                'description_element': 'dd',
            } ),
            'a': __.immut.Dictionary( {
                'description_source': 'parent_next_sibling',
                'description_element': 'dd',
            } ),
        } ),
        'cleanup_selectors': [ 'a.headerlink' ],
    } ),
    'furo': __.immut.Dictionary( {
        'anchor_elements': [ 'span', 'a', 'dt' ],
        'content_strategies': __.immut.Dictionary( {
            'span': __.immut.Dictionary( {
                'description_source': 'parent_next_element',
                'description_element': 'p',
                'fallback_container': 'section',
            } ),
            'a': __.immut.Dictionary( {
                'description_source': 'parent_next_element',
                'description_element': 'p',
            } ),
            'dt': __.immut.Dictionary( {
                'description_source': 'next_sibling',
                'description_element': 'dd',
            } ),
        } ),
        'cleanup_selectors': [ 'a.headerlink', '.highlight' ],
    } ),
    'sphinx_rtd_theme': __.immut.Dictionary( {
        'anchor_elements': [ 'dt', 'span', 'a' ],
        'content_strategies': __.immut.Dictionary( {
            'dt': __.immut.Dictionary( {
                'description_source': 'next_sibling',
                'description_element': 'dd',
            } ),
            'span': __.immut.Dictionary( {
                'description_source': 'parent_content',
                'description_element': 'p',
            } ),
        } ),
        'cleanup_selectors': [ 'a.headerlink' ],
    } ),
} )

# Generic fallback pattern for unknown themes
_GENERIC_PATTERN = __.immut.Dictionary( {
    'anchor_elements': [ 'dt', 'span', 'a', 'section', 'div' ],
    'content_strategies': __.immut.Dictionary( {
        'dt': __.immut.Dictionary( {
            'description_source': 'next_sibling',
            'description_element': 'dd',
        } ),
        'section': __.immut.Dictionary( {
            'description_source': 'first_paragraph',
            'description_element': 'p',
        } ),
        'span': __.immut.Dictionary( {
            'description_source': 'parent_next_element',
            'description_element': 'p',
        } ),
        'a': __.immut.Dictionary( {
            'description_source': 'parent_next_element',
            'description_element': 'p',
        } ),
    } ),
    'cleanup_selectors': [ 'a.headerlink' ],
} )
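
# Notes on the pattern DSL above (explanatory comment added for this listing,
# derived from the extraction code later in this module): each entry in
# 'content_strategies' keys on an anchor tag name and selects a lookup mode
# via 'description_source' (handled by _get_description_by_source_type:
# 'next_sibling', 'parent_next_sibling', 'parent_next_element',
# 'parent_content', or 'first_paragraph'), while 'description_element' names
# the tag whose HTML becomes the description.  'cleanup_selectors' lists CSS
# selectors intended for removal by _cleanup_content.  'anchor_elements'
# appears to enumerate the tags that may carry anchor ids; it is not consumed
# within this module.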

async def extract_contents(
    auxdata: __.ApplicationGlobals,
    source: str,
    objects: __.cabc.Sequence[ __.InventoryObject ], /, *,
    theme: __.Absential[ str ] = __.absent,
) -> list[ __.ContentDocument ]:
    ''' Extracts documentation content for specified objects. '''
    base_url = _urls.normalize_base_url( source )
    if not objects: return [ ]
    tasks = [
        _extract_object_documentation(
            auxdata, base_url, source, obj, theme )
        for obj in objects ]
    candidate_results = await __.asyncf.gather_async(
        *tasks, return_exceptions = True )
    results: list[ __.ContentDocument ] = [
        result.value for result in candidate_results
        if __.generics.is_value( result ) and result.value is not None ]
    return results
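
# Concurrency note on extract_contents above: the gather-and-filter step keeps
# only successful, non-None documents.  A rough plain-asyncio analogue of that
# step is sketched below (names here are illustrative; the project's
# __.asyncf.gather_async wraps outcomes in result objects rather than
# returning raw exceptions):
#
#     import asyncio
#
#     async def _gather_successes( coroutines ):
#         outcomes = await asyncio.gather(
#             *coroutines, return_exceptions = True )
#         return [
#             outcome for outcome in outcomes
#             if not isinstance( outcome, BaseException )
#             and outcome is not None ]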

def parse_documentation_html(
    content: str, element_id: str, url: str, *,
    theme: __.Absential[ str ] = __.absent
) -> __.cabc.Mapping[ str, str ]:
    ''' Parses HTML content to extract documentation sections. '''
    try: soup = _BeautifulSoup( content, 'lxml' )
    except Exception as exc:
        raise __.DocumentationParseFailure(
            element_id, exc ) from exc
    # Theme should be provided from detection metadata.
    # If absent, generic container detection is used as a fallback.
    container = _find_main_content_container( soup, theme )
    if __.is_absent( container ):
        raise __.DocumentationContentAbsence( element_id )
    element = container.find( id = element_id )
    if not element:
        raise __.DocumentationObjectAbsence( element_id, url )
    description = _extract_content_with_dsl(
        element, element_id, theme )
    return {
        'description': description,
        'object_name': element_id,
    }
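
# Usage sketch for parse_documentation_html (illustrative only; the HTML
# fragment, the object name 'pkg.func', and the URL below are invented for
# demonstration and rely on the generic fallback container detection):
#
#     html = (
#         '<article role="main"><dl>'
#         '<dt id="pkg.func">pkg.func()</dt>'
#         '<dd><p>Does something useful.</p></dd>'
#         '</dl></article>' )
#     sections = parse_documentation_html(
#         html, 'pkg.func', 'https://example.invalid/api.html' )
#     sections[ 'description' ]  # '<p>Does something useful.</p>'
#     sections[ 'object_name' ]  # 'pkg.func'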

def _cleanup_content(
    content: str,
    cleanup_selectors: __.cabc.Sequence[ str ]
) -> str:
    ''' Removes unwanted elements from content using CSS selectors. '''
    # TODO: Implement CSS selector-based cleanup
    return content
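
# Illustrative sketch only (not part of the original module): one possible way
# to address the TODO in _cleanup_content is to re-parse the HTML fragment and
# drop every node matched by the configured CSS selectors.  The helper name
# below is hypothetical.
def _cleanup_content_sketch(
    content: str,
    cleanup_selectors: __.cabc.Sequence[ str ]
) -> str:
    ''' Removes elements matching CSS selectors (illustrative sketch). '''
    # 'html.parser' avoids wrapping fragments in <html>/<body> elements.
    soup = _BeautifulSoup( content, 'html.parser' )
    for selector in cleanup_selectors:
        for node in soup.select( selector ):
            node.decompose( )  # remove the matched element and its children
    return str( soup )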

def _extract_content_with_dsl(
    element: __.typx.Any,
    element_id: str,
    theme: __.Absential[ str ] = __.absent
) -> str:
    ''' Extracts content using DSL pattern configuration. '''
    theme_name = theme if not __.is_absent( theme ) else None
    if theme_name is not None:
        pattern = THEME_EXTRACTION_PATTERNS.get( theme_name, _GENERIC_PATTERN )
    else: pattern = _GENERIC_PATTERN
    content_strategies = __.typx.cast(
        __.cabc.Mapping[ str, __.cabc.Mapping[ str, __.typx.Any ] ],
        pattern[ 'content_strategies' ] )
    strategy = content_strategies.get( element.name )
    if not strategy: return _generic_extraction( element )
    description = _extract_description_with_strategy( element, strategy )
    if 'cleanup_selectors' in pattern:
        cleanup_selectors = __.typx.cast(
            __.cabc.Sequence[ str ], pattern[ 'cleanup_selectors' ] )
        description = _cleanup_content( description, cleanup_selectors )
    return description


def _extract_description_with_strategy(
    element: __.typx.Any,
    strategy: __.cabc.Mapping[ str, __.typx.Any ]
) -> str:
    ''' Extracts description using DSL strategy. '''
    source_type = __.typx.cast( str, strategy[ 'description_source' ] )
    element_type = __.typx.cast(
        str, strategy.get( 'description_element', 'p' ) )
    return _get_description_by_source_type(
        element, source_type, element_type )


async def _extract_object_documentation(
    auxdata: __.ApplicationGlobals,
    base_url: __.typx.Any,
    location: str,
    obj: __.InventoryObject,
    theme: __.Absential[ str ] = __.absent,
) -> __.ContentDocument | None:
    ''' Extracts documentation for a single object. '''
    from . import conversion as _conversion
    doc_url = _urls.derive_documentation_url(
        base_url, obj.uri, obj.name )
    try:
        html_content = (
            await __.retrieve_url_as_text(
                auxdata.content_cache, doc_url ) )
    except Exception as exc:
        _scribe.debug( "Failed to retrieve %s: %s", doc_url, exc )
        return None
    anchor = doc_url.fragment or str( obj.name )
    try:
        parsed_content = parse_documentation_html(
            html_content, anchor, str( doc_url ), theme = theme )
    except Exception: return None
    description = _conversion.html_to_markdown(
        parsed_content[ 'description' ] )
    content_id = __.produce_content_id( location, obj.name )
    return __.ContentDocument(
        inventory_object = obj,
        content_id = content_id,
        description = description,
        documentation_url = doc_url.geturl( ),
        extraction_metadata = __.immut.Dictionary( {
            'theme': theme if not __.is_absent( theme ) else 'unknown',
            'extraction_method': 'sphinx_html_parsing',
            'relevance_score': 1.0,
            'match_reasons': [ 'direct extraction' ],
        } )
    )


def _find_main_content_container(
    soup: __.typx.Any, theme: __.Absential[ str ] = __.absent
) -> __.Absential[ __.typx.Any ]:
    ''' Finds the main content container using theme-specific strategies. '''
    if theme == 'furo':
        containers = [
            soup.find( 'article', { 'role': 'main' } ),
            soup.find( 'div', { 'id': 'furo-main-content' } ),
        ]
    elif theme == 'sphinx_rtd_theme':
        containers = [
            soup.find( 'div', { 'class': 'document' } ),
            soup.find( 'div', { 'class': 'body' } ),
            soup.find( 'div', { 'role': 'main' } ),
        ]
    elif theme == 'pydoctheme':  # Python docs
        containers = [
            soup.find( 'div', { 'class': 'body' } ),
            soup.find( 'div', { 'class': 'content' } ),
            soup.body,  # Python docs often use body directly
        ]
    elif theme == 'flask':  # Flask docs
        containers = [
            soup.find( 'div', { 'class': 'body' } ),
            soup.find( 'div', { 'class': 'content' } ),
            soup.body,
        ]
    elif theme == 'alabaster':
        containers = [
            soup.find( 'div', { 'class': 'body' } ),
            soup.find( 'div', { 'class': 'content' } ),
        ]
    else:  # Generic fallback for unknown themes
        containers = [
            soup.find( 'article', { 'role': 'main' } ),  # Furo theme
            soup.find( 'div', { 'class': 'body' } ),  # Basic theme
            soup.find( 'div', { 'class': 'content' } ),  # Nature theme
            soup.find( 'div', { 'class': 'main' } ),  # Generic main
            soup.find( 'main' ),  # HTML5 main element
            soup.find( 'div', { 'role': 'main' } ),  # Role-based
            soup.body,  # Fallback to body if nothing else works
        ]
    for container in containers:
        if container: return container
    return __.absent


def _generic_extraction( element: __.typx.Any ) -> str:
    ''' Generic fallback extraction for unknown element types. '''
    description = ''
    if element.parent:
        next_p = element.parent.find( 'p' )
        if next_p:
            description = str( next_p )
    return description


def _get_description_by_source_type(
    element: __.typx.Any,
    source_type: str,
    element_type: str
) -> str:
    ''' Gets description content based on source type. '''
    match source_type:
        case 'next_sibling':
            return _get_sibling_text( element, element_type )
        case 'parent_next_sibling':
            return _get_parent_sibling_text( element, element_type )
        case 'parent_next_element':
            return _get_parent_element_text( element, element_type )
        case 'parent_content':
            return _get_parent_content_text( element, element_type )
        case 'first_paragraph':
            return _get_first_paragraph_text( element )
        case _: return ''


def _get_first_paragraph_text( element: __.typx.Any ) -> str:
    ''' Gets HTML content from first paragraph within element. '''
    paragraph = element.find( 'p' )
    return str( paragraph ) if paragraph else ''


def _get_parent_content_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from content element within parent. '''
    if element.parent:
        content_elem = element.parent.find( element_type )
        return content_elem.decode_contents( ) if content_elem else ''
    return ''


def _get_parent_element_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from element within parent. '''
    if element.parent:
        next_elem = element.parent.find( element_type )
        return next_elem.decode_contents( ) if next_elem else ''
    return ''


def _get_parent_sibling_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from parent's next sibling element. '''
    if element.parent:
        sibling = element.parent.find_next_sibling( element_type )
        return sibling.decode_contents( ) if sibling else ''
    return ''


def _get_sibling_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from next sibling element. '''
    sibling = element.find_next_sibling( element_type )
    return sibling.decode_contents( ) if sibling else ''