Coverage for sources/librovore/structures/sphinx/extraction.py: 11%

139 statements  

« prev     ^ index     » next       coverage.py v7.10.5, created at 2025-08-29 01:14 +0000

1# vim: set filetype=python fileencoding=utf-8: 

2# -*- coding: utf-8 -*- 

3 

4#============================================================================# 

5# # 

6# Licensed under the Apache License, Version 2.0 (the "License"); # 

7# you may not use this file except in compliance with the License. # 

8# You may obtain a copy of the License at # 

9# # 

10# http://www.apache.org/licenses/LICENSE-2.0 # 

11# # 

12# Unless required by applicable law or agreed to in writing, software # 

13# distributed under the License is distributed on an "AS IS" BASIS, # 

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 

15# See the License for the specific language governing permissions and # 

16# limitations under the License. # 

17# # 

18#============================================================================# 

19 

20 

21''' Documentation extraction and content retrieval. ''' 

22 

23 

24from bs4 import BeautifulSoup as _BeautifulSoup 

25 

26from . import __ 

27from . import urls as _urls 

28 

29 

# Module-level scribe (logger) for debug diagnostics during extraction.
_scribe = __.acquire_scribe( __name__ )

31 

32 

# Theme-specific content extraction patterns.
#
# Small DSL consumed by '_extract_content_with_dsl'. Keys per theme:
#   anchor_elements: tag names that may carry an object's anchor id.
#   content_strategies: per-tag plans; 'signature_source' is interpreted
#       by '_extract_signature_with_strategy', while 'description_source'
#       and 'description_element' are interpreted by
#       '_extract_description_with_strategy'.
#   cleanup_selectors: CSS selectors whose matches should be stripped
#       from extracted descriptions (see '_cleanup_content').
THEME_EXTRACTION_PATTERNS: __.cabc.Mapping[
    str, __.cabc.Mapping[ str, __.typx.Any ]
] = __.immut.Dictionary( {
    'pydoctheme': __.immut.Dictionary( {
        'anchor_elements': [ 'dt', 'a' ],
        'content_strategies': __.immut.Dictionary( {
            'dt': __.immut.Dictionary( {
                'signature_source': 'self',
                'description_source': 'next_sibling',
                'description_element': 'dd',
            } ),
            'a': __.immut.Dictionary( {
                'signature_source': 'parent_text',
                'description_source': 'parent_next_sibling',
                'description_element': 'dd',
            } ),
        } ),
        'cleanup_selectors': [ 'a.headerlink' ],
    } ),
    'furo': __.immut.Dictionary( {
        'anchor_elements': [ 'span', 'a', 'dt' ],
        'content_strategies': __.immut.Dictionary( {
            'span': __.immut.Dictionary( {
                'signature_source': 'parent_header',
                'description_source': 'parent_next_element',
                'description_element': 'p',
                # NOTE(review): 'fallback_container' is not read by any
                # strategy interpreter visible in this module — confirm.
                'fallback_container': 'section',
            } ),
            'a': __.immut.Dictionary( {
                'signature_source': 'parent_text',
                'description_source': 'parent_next_element',
                'description_element': 'p',
            } ),
            'dt': __.immut.Dictionary( {
                'signature_source': 'self',
                'description_source': 'next_sibling',
                'description_element': 'dd',
            } ),
        } ),
        'cleanup_selectors': [ 'a.headerlink', '.highlight' ],
    } ),
    'sphinx_rtd_theme': __.immut.Dictionary( {
        'anchor_elements': [ 'dt', 'span', 'a' ],
        'content_strategies': __.immut.Dictionary( {
            'dt': __.immut.Dictionary( {
                'signature_source': 'self',
                'description_source': 'next_sibling',
                'description_element': 'dd',
            } ),
            'span': __.immut.Dictionary( {
                'signature_source': 'parent_header',
                'description_source': 'parent_content',
                'description_element': 'p',
            } ),
        } ),
        'cleanup_selectors': [ 'a.headerlink' ],
    } ),
} )

92 

# Generic fallback pattern for unknown themes.
# Same DSL shape as THEME_EXTRACTION_PATTERNS entries; selected whenever
# the theme is absent or unrecognized (see '_extract_content_with_dsl').
_GENERIC_PATTERN = __.immut.Dictionary( {
    'anchor_elements': [ 'dt', 'span', 'a', 'section', 'div' ],
    'content_strategies': __.immut.Dictionary( {
        'dt': __.immut.Dictionary( {
            'signature_source': 'self',
            'description_source': 'next_sibling',
            'description_element': 'dd',
        } ),
        'section': __.immut.Dictionary( {
            'signature_source': 'first_header',
            'description_source': 'first_paragraph',
            'description_element': 'p',
        } ),
        'span': __.immut.Dictionary( {
            'signature_source': 'parent_header',
            'description_source': 'parent_next_element',
            'description_element': 'p',
        } ),
        'a': __.immut.Dictionary( {
            'signature_source': 'parent_text',
            'description_source': 'parent_next_element',
            'description_element': 'p',
        } ),
    } ),
    'cleanup_selectors': [ 'a.headerlink' ],
} )

120 

121 

async def extract_contents(
    auxdata: __.ApplicationGlobals,
    source: str,
    objects: __.cabc.Sequence[ __.InventoryObject ], /, *,
    theme: __.Absential[ str ] = __.absent,
    include_snippets: bool = True,
) -> list[ __.ContentDocument ]:
    ''' Extracts documentation content for specified objects.

        Fans out one extraction task per inventory object and collects
        the successful results; failed or empty extractions are dropped.
    '''
    base_url = _urls.normalize_base_url( source )
    if not objects: return [ ]
    extraction_tasks = [ ]
    for obj in objects:
        extraction_tasks.append(
            _extract_object_documentation(
                auxdata, base_url, obj, include_snippets, theme ) )
    outcomes = await __.asyncf.gather_async(
        *extraction_tasks, return_exceptions = True )
    documents: list[ __.ContentDocument ] = [ ]
    for outcome in outcomes:
        # Keep only successful outcomes which produced a document.
        if not __.generics.is_value( outcome ): continue
        if outcome.value is None: continue
        documents.append( outcome.value )
    return documents

142 

143 

def parse_documentation_html(
    content: str, element_id: str, url: str, *,
    theme: __.Absential[ str ] = __.absent
) -> __.cabc.Mapping[ str, str ]:
    ''' Parses HTML content to extract documentation sections.

        Raises parse, content-absence, or object-absence errors when the
        page cannot be parsed, lacks a main container, or lacks the
        anchored element.
    '''
    try:
        document = _BeautifulSoup( content, 'lxml' )
    except Exception as exc:
        raise __.DocumentationParseFailure( element_id, exc ) from exc
    # Theme comes from detection metadata; when absent, container lookup
    # falls back to generic heuristics.
    container = _find_main_content_container( document, theme )
    if __.is_absent( container ):
        raise __.DocumentationContentAbsence( element_id )
    target = container.find( id = element_id )
    if not target:
        raise __.DocumentationObjectAbsence( element_id, url )
    signature, description = _extract_content_with_dsl(
        target, element_id, theme )
    return {
        'signature': signature,
        'description': description,
        'object_name': element_id,
    }

168 

169 

def _cleanup_content(
    content: str,
    cleanup_selectors: __.cabc.Sequence[ str ]
) -> str:
    ''' Removes unwanted elements from content using CSS selectors.

        Parses the HTML fragment, removes every element matching any of
        the provided CSS selectors (e.g. ``a.headerlink`` permalink
        anchors), and returns the remaining markup. Best-effort: on any
        parse failure the content is returned unchanged.
    '''
    # Previously a TODO stub which returned content unchanged, leaving
    # the per-theme 'cleanup_selectors' configuration dead.
    if not content or not cleanup_selectors: return content
    try: fragment = _BeautifulSoup( content, 'lxml' )
    except Exception: return content  # never fail extraction over cleanup
    for selector in cleanup_selectors:
        for node in fragment.select( selector ):
            node.decompose( )
    # lxml wraps fragments in <html><body>; unwrap to return the fragment.
    body = fragment.find( 'body' )
    return body.decode_contents( ) if body else str( fragment )

177 

178 

def _extract_content_with_dsl(
    element: __.typx.Any,
    element_id: str,
    theme: __.Absential[ str ] = __.absent
) -> tuple[ str, str ]:
    ''' Extracts content using DSL pattern configuration.

        Returns a (signature, description) pair; falls back to generic
        extraction when no strategy matches the element's tag name.
    '''
    pattern = _GENERIC_PATTERN
    if not __.is_absent( theme ) and theme is not None:
        pattern = THEME_EXTRACTION_PATTERNS.get( theme, _GENERIC_PATTERN )
    strategies = __.typx.cast(
        __.cabc.Mapping[ str, __.cabc.Mapping[ str, __.typx.Any ] ],
        pattern[ 'content_strategies' ] )
    strategy = strategies.get( element.name )
    if not strategy: return _generic_extraction( element )
    signature = _extract_signature_with_strategy( element, strategy )
    description = _extract_description_with_strategy( element, strategy )
    if 'cleanup_selectors' in pattern:
        selectors = __.typx.cast(
            __.cabc.Sequence[ str ], pattern[ 'cleanup_selectors' ] )
        description = _cleanup_content( description, selectors )
    return signature, description

201 

202 

def _extract_description_with_strategy(
    element: __.typx.Any,
    strategy: __.cabc.Mapping[ str, __.typx.Any ]
) -> str:
    ''' Extracts description using DSL strategy.

        Resolves the strategy's source type and target element (which
        defaults to paragraphs), then delegates to the source dispatcher.
    '''
    source = __.typx.cast( str, strategy[ 'description_source' ] )
    target_tag = __.typx.cast(
        str, strategy.get( 'description_element', 'p' ) )
    return _get_description_by_source_type( element, source, target_tag )

213 

214 

async def _extract_object_documentation(
    auxdata: __.ApplicationGlobals,
    base_url: __.typx.Any,
    obj: __.InventoryObject,
    include_snippets: bool,
    theme: __.Absential[ str ] = __.absent
) -> __.ContentDocument | None:
    ''' Extracts documentation for a single object.

        Retrieves the object's documentation page, parses the anchored
        section, and converts the description to Markdown. Returns
        ``None`` on any retrieval or parse failure (best-effort; callers
        filter out ``None`` results).
    '''
    # Local import; presumably avoids a circular import — TODO confirm.
    from . import conversion as _conversion
    doc_url = _urls.derive_documentation_url(
        base_url, obj.uri, obj.name )
    try:
        html_content = (
            await __.retrieve_url_as_text(
                auxdata.content_cache, doc_url ) )
    except Exception as exc:
        # Retrieval failures are logged at debug level and swallowed.
        _scribe.debug( "Failed to retrieve %s: %s", doc_url, exc )
        return None
    # Anchor defaults to the object name when the URL has no fragment.
    anchor = doc_url.fragment or str( obj.name )
    try:
        parsed_content = parse_documentation_html(
            html_content, anchor, str( doc_url ), theme = theme )
    except Exception: return None
    description = _conversion.html_to_markdown(
        parsed_content[ 'description' ] )
    snippet_max_length = 200
    if include_snippets:
        # Truncate long descriptions, marking truncation with an ellipsis.
        content_snippet = (
            description[ : snippet_max_length ] + '...'
            if len( description ) > snippet_max_length
            else description )
    else: content_snippet = ''
    return __.ContentDocument(
        inventory_object = obj,
        signature = parsed_content[ 'signature' ],
        description = description,
        content_snippet = content_snippet,
        documentation_url = doc_url.geturl( ),
        extraction_metadata = __.immut.Dictionary( {
            'theme': theme if not __.is_absent( theme ) else 'unknown',
            'extraction_method': 'sphinx_html_parsing',
            'relevance_score': 1.0,
            'match_reasons': [ 'direct extraction' ],
        } )
    )

260 

261 

def _extract_signature_with_strategy(
    element: __.typx.Any,
    strategy: __.cabc.Mapping[ str, __.typx.Any ]
) -> str:
    ''' Extracts signature using DSL strategy.

        Interprets the strategy's 'signature_source' directive; unknown
        directives behave like 'self'.
    '''
    headers = [ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ]
    source_type = __.typx.cast( str, strategy[ 'signature_source' ] )
    if source_type == 'parent_text':
        parent = element.parent
        if not parent: return ''
        return _clean_extracted_text( parent.get_text( ) )
    if source_type == 'parent_header':
        parent = element.parent
        if not parent: return ''
        header = parent.find( headers )
        if not header: return ''
        return _clean_extracted_text( header.get_text( ) )
    if source_type == 'first_header':
        header = element.find( headers )
        if not header: return ''
        return _clean_extracted_text( header.get_text( ) )
    # 'self' and any unrecognized source use the element's own text.
    return _clean_extracted_text( element.get_text( ) )

289 

290 

def _find_main_content_container(
    soup: __.typx.Any, theme: __.Absential[ str ] = __.absent
) -> __.Absential[ __.typx.Any ]:
    ''' Finds the main content container using theme-specific strategies.

        Returns the first truthy candidate container for the detected
        theme, or ``absent`` when no candidate matches.
    '''
    if theme == 'furo':
        candidates = [
            soup.find( 'article', { 'role': 'main' } ),
            soup.find( 'div', { 'id': 'furo-main-content' } ),
        ]
    elif theme == 'sphinx_rtd_theme':
        candidates = [
            soup.find( 'div', { 'class': 'document' } ),
            soup.find( 'div', { 'class': 'body' } ),
            soup.find( 'div', { 'role': 'main' } ),
        ]
    elif theme in ( 'pydoctheme', 'flask' ):
        # Python docs and Flask docs share the same container layout;
        # both often place content directly under <body>.
        candidates = [
            soup.find( 'div', { 'class': 'body' } ),
            soup.find( 'div', { 'class': 'content' } ),
            soup.body,
        ]
    elif theme == 'alabaster':
        candidates = [
            soup.find( 'div', { 'class': 'body' } ),
            soup.find( 'div', { 'class': 'content' } ),
        ]
    else:  # generic fallback heuristics for unknown themes
        candidates = [
            soup.find( 'article', { 'role': 'main' } ),  # Furo theme
            soup.find( 'div', { 'class': 'body' } ),     # basic theme
            soup.find( 'div', { 'class': 'content' } ),  # Nature theme
            soup.find( 'div', { 'class': 'main' } ),     # generic main
            soup.find( 'main' ),                         # HTML5 main
            soup.find( 'div', { 'role': 'main' } ),      # role-based
            soup.body,                                   # last resort
        ]
    return next(
        ( candidate for candidate in candidates if candidate ), __.absent )

336 

337 

def _clean_extracted_text( text: str ) -> str:
    ''' Cleans extracted text while preserving internal spacing.

        Strips outer whitespace, collapses runs of spaces to one space,
        and collapses blank-line runs to a single paragraph break.
    '''
    stripped = text.strip( )
    single_spaced = __.re.sub( r' +', ' ', stripped )
    return __.re.sub( r'\n\s*\n', '\n\n', single_spaced )

346 

347 

def _generic_extraction( element: __.typx.Any ) -> tuple[ str, str ]:
    ''' Generic fallback extraction for unknown element types.

        Signature is the element's own cleaned text; description is the
        raw HTML of the first paragraph under the parent, when present.
    '''
    signature = _clean_extracted_text( element.get_text( ) )
    parent = element.parent
    if not parent: return signature, ''
    paragraph = parent.find( 'p' )
    return signature, ( str( paragraph ) if paragraph else '' )

357 

358 

def _get_description_by_source_type(
    element: __.typx.Any,
    source_type: str,
    element_type: str
) -> str:
    ''' Gets description content based on source type.

        Dispatches to the matching extraction helper; unknown source
        types yield an empty string.
    '''
    if source_type == 'first_paragraph':
        return _get_first_paragraph_text( element )
    extractors = {
        'next_sibling': _get_sibling_text,
        'parent_next_sibling': _get_parent_sibling_text,
        'parent_next_element': _get_parent_element_text,
        'parent_content': _get_parent_content_text,
    }
    extractor = extractors.get( source_type )
    if extractor is None: return ''
    return extractor( element, element_type )

377 

378 

def _get_first_paragraph_text( element: __.typx.Any ) -> str:
    ''' Gets HTML content from first paragraph within element. '''
    first_paragraph = element.find( 'p' )
    if not first_paragraph: return ''
    return str( first_paragraph )

383 

384 

def _get_parent_content_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from content element within parent. '''
    parent = element.parent
    if not parent: return ''
    content_element = parent.find( element_type )
    if not content_element: return ''
    return content_element.decode_contents( )

391 

392 

def _get_parent_element_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from element within parent. '''
    # NOTE(review): identical logic to _get_parent_content_text; the two
    # are kept separate because the DSL names them distinctly.
    if not element.parent: return ''
    found = element.parent.find( element_type )
    return found.decode_contents( ) if found else ''

399 

400 

def _get_parent_sibling_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from parent's next sibling element. '''
    parent = element.parent
    if not parent: return ''
    neighbor = parent.find_next_sibling( element_type )
    return '' if not neighbor else neighbor.decode_contents( )

407 

408 

def _get_sibling_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from next sibling element. '''
    neighbor = element.find_next_sibling( element_type )
    if not neighbor: return ''
    return neighbor.decode_contents( )