Coverage for sources/librovore/structures/sphinx/extraction.py: 11%

139 statements  

« prev     ^ index     » next       coverage.py v7.10.4, created at 2025-08-17 23:43 +0000

1# vim: set filetype=python fileencoding=utf-8: 

2# -*- coding: utf-8 -*- 

3 

4#============================================================================# 

5# # 

6# Licensed under the Apache License, Version 2.0 (the "License"); # 

7# you may not use this file except in compliance with the License. # 

8# You may obtain a copy of the License at # 

9# # 

10# http://www.apache.org/licenses/LICENSE-2.0 # 

11# # 

12# Unless required by applicable law or agreed to in writing, software # 

13# distributed under the License is distributed on an "AS IS" BASIS, # 

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 

15# See the License for the specific language governing permissions and # 

16# limitations under the License. # 

17# # 

18#============================================================================# 

19 

20 

21''' Documentation extraction and content retrieval. ''' 

22 

23 

24from bs4 import BeautifulSoup as _BeautifulSoup 

25 

26from . import __ 

27from . import urls as _urls 

28 

29 

30_scribe = __.acquire_scribe( __name__ ) 

31 

32 

# Theme-specific content extraction patterns.
#
# DSL keys understood by the extraction helpers in this module:
#   anchor_elements     -- tag names that may carry an object's anchor id.
#   content_strategies  -- per-tag recipe mapping the anchor tag name to:
#       signature_source    -- where to read the signature text
#                              ('self', 'parent_text', 'parent_header', ...).
#       description_source  -- where to read the description HTML
#                              ('next_sibling', 'parent_next_element', ...).
#       description_element -- tag name to look for at that location.
#       fallback_container  -- NOTE(review): declared for furo 'span' but not
#                              consumed by any visible strategy code; confirm.
#   cleanup_selectors   -- CSS selectors for elements intended to be stripped
#                          from extracted descriptions (e.g. headerlinks).
THEME_EXTRACTION_PATTERNS: __.cabc.Mapping[
    str, __.cabc.Mapping[ str, __.typx.Any ]
] = __.immut.Dictionary( {
    'pydoctheme': __.immut.Dictionary( {
        'anchor_elements': [ 'dt', 'a' ],
        'content_strategies': __.immut.Dictionary( {
            'dt': __.immut.Dictionary( {
                'signature_source': 'self',
                'description_source': 'next_sibling',
                'description_element': 'dd',
            } ),
            'a': __.immut.Dictionary( {
                'signature_source': 'parent_text',
                'description_source': 'parent_next_sibling',
                'description_element': 'dd',
            } ),
        } ),
        'cleanup_selectors': [ 'a.headerlink' ],
    } ),
    'furo': __.immut.Dictionary( {
        'anchor_elements': [ 'span', 'a', 'dt' ],
        'content_strategies': __.immut.Dictionary( {
            'span': __.immut.Dictionary( {
                'signature_source': 'parent_header',
                'description_source': 'parent_next_element',
                'description_element': 'p',
                'fallback_container': 'section',
            } ),
            'a': __.immut.Dictionary( {
                'signature_source': 'parent_text',
                'description_source': 'parent_next_element',
                'description_element': 'p',
            } ),
            'dt': __.immut.Dictionary( {
                'signature_source': 'self',
                'description_source': 'next_sibling',
                'description_element': 'dd',
            } ),
        } ),
        'cleanup_selectors': [ 'a.headerlink', '.highlight' ],
    } ),
    'sphinx_rtd_theme': __.immut.Dictionary( {
        'anchor_elements': [ 'dt', 'span', 'a' ],
        'content_strategies': __.immut.Dictionary( {
            'dt': __.immut.Dictionary( {
                'signature_source': 'self',
                'description_source': 'next_sibling',
                'description_element': 'dd',
            } ),
            'span': __.immut.Dictionary( {
                'signature_source': 'parent_header',
                'description_source': 'parent_content',
                'description_element': 'p',
            } ),
        } ),
        'cleanup_selectors': [ 'a.headerlink' ],
    } ),
} )

92 

# Generic fallback pattern for unknown themes.
# Same DSL shape as the theme-specific patterns: tries the widest set of
# anchor element types and the safest extraction strategies for each.
_GENERIC_PATTERN = __.immut.Dictionary( {
    'anchor_elements': [ 'dt', 'span', 'a', 'section', 'div' ],
    'content_strategies': __.immut.Dictionary( {
        'dt': __.immut.Dictionary( {
            'signature_source': 'self',
            'description_source': 'next_sibling',
            'description_element': 'dd',
        } ),
        'section': __.immut.Dictionary( {
            'signature_source': 'first_header',
            'description_source': 'first_paragraph',
            'description_element': 'p',
        } ),
        'span': __.immut.Dictionary( {
            'signature_source': 'parent_header',
            'description_source': 'parent_next_element',
            'description_element': 'p',
        } ),
        'a': __.immut.Dictionary( {
            'signature_source': 'parent_text',
            'description_source': 'parent_next_element',
            'description_element': 'p',
        } ),
    } ),
    'cleanup_selectors': [ 'a.headerlink' ],
} )

120 

121 

async def extract_contents(
    auxdata: __.ApplicationGlobals,
    source: str,
    objects: __.cabc.Sequence[ __.cabc.Mapping[ str, __.typx.Any ] ], /, *,
    theme: __.Absential[ str ] = __.absent,
    include_snippets: bool = True,
) -> list[ dict[ str, __.typx.Any ] ]:
    ''' Extracts documentation content for specified objects.

        Objects which fail retrieval or parsing are silently omitted
        from the returned list.
    '''
    base_url = _urls.normalize_base_url( source )
    if not objects: return [ ]
    extraction_tasks = [ ]
    for obj in objects:
        extraction_tasks.append(
            _extract_object_documentation(
                auxdata, base_url, dict( obj ), include_snippets, theme ) )
    outcomes = await __.asyncf.gather_async(
        *extraction_tasks, return_exceptions = True )
    extracted: list[ dict[ str, __.typx.Any ] ] = [ ]
    for outcome in outcomes:
        # Skip failed extractions and objects which yielded no content.
        if not __.generics.is_value( outcome ): continue
        if outcome.value is None: continue
        extracted.append( dict( outcome.value ) )
    return extracted

142 

143 

def parse_documentation_html(
    content: str, element_id: str, url: str, *,
    theme: __.Absential[ str ] = __.absent
) -> __.cabc.Mapping[ str, str ]:
    ''' Parses HTML content to extract documentation sections.

        Raises parse, content-absence, or object-absence failures when
        the page cannot be processed or the anchor cannot be located.
    '''
    try:
        soup = _BeautifulSoup( content, 'lxml' )
    except Exception as exc:
        raise __.DocumentationParseFailure( element_id, exc ) from exc
    # Theme arrives from detection metadata; when absent, container
    # lookup falls back to generic heuristics.
    container = _find_main_content_container( soup, theme )
    if __.is_absent( container ):
        raise __.DocumentationContentAbsence( element_id )
    target = container.find( id = element_id )
    if not target:
        raise __.DocumentationObjectAbsence( element_id, url )
    signature, description = _extract_content_with_dsl(
        target, element_id, theme )
    sections = {
        'signature': signature,
        'description': description,
        'object_name': element_id,
    }
    return sections

168 

169 

def _cleanup_content(
    content: str,
    cleanup_selectors: __.cabc.Sequence[ str ]
) -> str:
    ''' Removes unwanted elements from content using CSS selectors.

        Parses the HTML fragment, decomposes every element matching any
        of the supplied CSS selectors (e.g. ``a.headerlink``), and
        re-serializes the fragment. Cleanup is best-effort: on any parse
        or selector failure the original content is returned unchanged.
        (Previously this was an unimplemented TODO stub which returned
        the content untouched, so declared cleanup selectors had no
        effect.)
    '''
    if not content or not cleanup_selectors: return content
    try:
        # 'html.parser' round-trips fragments without injecting
        # <html>/<body> wrappers, unlike the 'lxml' parser used for
        # whole pages elsewhere in this module.
        fragment = _BeautifulSoup( content, 'html.parser' )
        for selector in cleanup_selectors:
            for node in fragment.select( selector ):
                node.decompose( )
        return str( fragment )
    except Exception:
        # Best-effort: never let cosmetic cleanup break extraction.
        return content

177 

178 

def _extract_content_with_dsl(
    element: __.typx.Any,
    element_id: str,
    theme: __.Absential[ str ] = __.absent
) -> tuple[ str, str ]:
    ''' Extracts content using DSL pattern configuration.

        Selects the pattern for the detected theme (generic fallback
        otherwise) and applies the strategy registered for the anchor
        element's tag name.
    '''
    pattern = _GENERIC_PATTERN
    if not __.is_absent( theme ) and theme is not None:
        pattern = THEME_EXTRACTION_PATTERNS.get( theme, _GENERIC_PATTERN )
    strategies = __.typx.cast(
        __.cabc.Mapping[ str, __.cabc.Mapping[ str, __.typx.Any ] ],
        pattern[ 'content_strategies' ] )
    strategy = strategies.get( element.name )
    # Unknown tag for this theme: fall back to generic extraction.
    if not strategy: return _generic_extraction( element )
    signature = _extract_signature_with_strategy( element, strategy )
    description = _extract_description_with_strategy( element, strategy )
    if 'cleanup_selectors' in pattern:
        selectors = __.typx.cast(
            __.cabc.Sequence[ str ], pattern[ 'cleanup_selectors' ] )
        description = _cleanup_content( description, selectors )
    return signature, description

201 

202 

def _extract_description_with_strategy(
    element: __.typx.Any,
    strategy: __.cabc.Mapping[ str, __.typx.Any ]
) -> str:
    ''' Extracts description using DSL strategy. '''
    source = __.typx.cast( str, strategy[ 'description_source' ] )
    # Default to paragraph elements when the strategy omits a tag name.
    tag = __.typx.cast( str, strategy.get( 'description_element', 'p' ) )
    return _get_description_by_source_type( element, source, tag )

213 

214 

async def _extract_object_documentation(
    auxdata: __.ApplicationGlobals,
    base_url: __.typx.Any,
    obj: dict[ str, __.typx.Any ],
    include_snippets: bool,
    theme: __.Absential[ str ] = __.absent
) -> dict[ str, __.typx.Any ] | None:
    ''' Extracts documentation for a single object.

        Retrieves the object's documentation page, parses out its
        signature and description, and converts the description to
        Markdown. Returns ``None`` on any retrieval or parse failure so
        that batch extraction can skip unusable objects without aborting.
    '''
    # Local import — presumably avoids an import cycle with the
    # conversion module; confirm before hoisting to module level.
    from . import conversion as _conversion
    doc_url = _urls.derive_documentation_url(
        base_url, obj[ 'uri' ], obj[ 'name' ] )
    try:
        html_content = (
            await __.retrieve_url_as_text(
                auxdata.content_cache, doc_url ) )
    except Exception as exc:
        # Best-effort retrieval: log at debug level and drop the object.
        _scribe.debug( "Failed to retrieve %s: %s", doc_url, exc )
        return None
    # Prefer the URL fragment as the anchor id; fall back to the
    # object's name when the derived URL carries no fragment.
    anchor = doc_url.fragment or str( obj[ 'name' ] )
    try:
        parsed_content = parse_documentation_html(
            html_content, anchor, str( doc_url ), theme = theme )
    except Exception: return None
    description = _conversion.html_to_markdown(
        parsed_content[ 'description' ] )
    snippet_max_length = 200  # characters retained before the ellipsis
    if include_snippets:
        content_snippet = (
            description[ : snippet_max_length ] + '...'
            if len( description ) > snippet_max_length
            else description )
    else: content_snippet = ''
    return {
        'object_name': obj[ 'name' ],
        'object_type': obj[ 'role' ],
        'domain': obj[ 'domain' ],
        'priority': obj[ 'priority' ],
        'url': doc_url.geturl( ),
        'signature': parsed_content[ 'signature' ],
        'description': description,
        'content_snippet': content_snippet,
        # NOTE(review): constant score/reasons — presumably re-ranked by
        # a caller; confirm against search pipeline.
        'relevance_score': 1.0,
        'match_reasons': [ 'direct extraction' ],
    }

259 

260 

def _extract_signature_with_strategy(
    element: __.typx.Any,
    strategy: __.cabc.Mapping[ str, __.typx.Any ]
) -> str:
    ''' Extracts signature using DSL strategy. '''
    headers = [ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ]
    source = __.typx.cast( str, strategy[ 'signature_source' ] )
    if source == 'parent_text':
        parent = element.parent
        if not parent: return ''
        return _clean_extracted_text( parent.get_text( ) )
    if source == 'parent_header':
        parent = element.parent
        if not parent: return ''
        header = parent.find( headers )
        if not header: return ''
        return _clean_extracted_text( header.get_text( ) )
    if source == 'first_header':
        header = element.find( headers )
        if not header: return ''
        return _clean_extracted_text( header.get_text( ) )
    # 'self' and any unrecognized source read the element's own text.
    return _clean_extracted_text( element.get_text( ) )

288 

289 

def _find_main_content_container(
    soup: __.typx.Any, theme: __.Absential[ str ] = __.absent
) -> __.Absential[ __.typx.Any ]:
    ''' Finds the main content container using theme-specific strategies.

        Tries a prioritized list of candidate containers for the theme
        (generic heuristics when the theme is absent or unrecognized)
        and returns the first match, or ``absent`` when none exists.
    '''
    generic_finders = (
        lambda: soup.find( 'article', { 'role': 'main' } ),  # Furo theme
        lambda: soup.find( 'div', { 'class': 'body' } ),     # Basic theme
        lambda: soup.find( 'div', { 'class': 'content' } ),  # Nature theme
        lambda: soup.find( 'div', { 'class': 'main' } ),     # Generic main
        lambda: soup.find( 'main' ),                         # HTML5 main
        lambda: soup.find( 'div', { 'role': 'main' } ),      # Role-based
        lambda: soup.body,                                   # Last resort
    )
    finders_by_theme = {
        'furo': (
            lambda: soup.find( 'article', { 'role': 'main' } ),
            lambda: soup.find( 'div', { 'id': 'furo-main-content' } ),
        ),
        'sphinx_rtd_theme': (
            lambda: soup.find( 'div', { 'class': 'document' } ),
            lambda: soup.find( 'div', { 'class': 'body' } ),
            lambda: soup.find( 'div', { 'role': 'main' } ),
        ),
        'pydoctheme': (  # Python docs
            lambda: soup.find( 'div', { 'class': 'body' } ),
            lambda: soup.find( 'div', { 'class': 'content' } ),
            lambda: soup.body,  # Python docs often use body directly
        ),
        'flask': (  # Flask docs
            lambda: soup.find( 'div', { 'class': 'body' } ),
            lambda: soup.find( 'div', { 'class': 'content' } ),
            lambda: soup.body,
        ),
        'alabaster': (
            lambda: soup.find( 'div', { 'class': 'body' } ),
            lambda: soup.find( 'div', { 'class': 'content' } ),
        ),
    }
    finders = generic_finders
    if not __.is_absent( theme ):
        finders = finders_by_theme.get( theme, generic_finders )
    for finder in finders:
        candidate = finder( )
        if candidate: return candidate
    return __.absent

335 

336 

def _clean_extracted_text( text: str ) -> str:
    ''' Cleans extracted text while preserving internal spacing.

        Trims the ends, collapses runs of spaces, and squeezes blank-line
        runs down to a single paragraph break.
    '''
    collapsed = __.re.sub( r' +', ' ', text.strip( ) )
    return __.re.sub( r'\n\s*\n', '\n\n', collapsed )

345 

346 

def _generic_extraction( element: __.typx.Any ) -> tuple[ str, str ]:
    ''' Generic fallback extraction for unknown element types.

        Signature is the element's own cleaned text; description is the
        raw HTML of the first paragraph under its parent, if any.
    '''
    signature = _clean_extracted_text( element.get_text( ) )
    parent = element.parent
    if not parent: return signature, ''
    paragraph = parent.find( 'p' )
    description = str( paragraph ) if paragraph else ''
    return signature, description

356 

357 

def _get_description_by_source_type(
    element: __.typx.Any,
    source_type: str,
    element_type: str
) -> str:
    ''' Gets description content based on source type.

        Dispatches to the helper matching the DSL source type; unknown
        source types yield an empty string.
    '''
    if source_type == 'first_paragraph':
        return _get_first_paragraph_text( element )
    handlers = {
        'next_sibling': _get_sibling_text,
        'parent_next_sibling': _get_parent_sibling_text,
        'parent_next_element': _get_parent_element_text,
        'parent_content': _get_parent_content_text,
    }
    handler = handlers.get( source_type )
    if handler is None: return ''
    return handler( element, element_type )

376 

377 

378def _get_first_paragraph_text( element: __.typx.Any ) -> str: 

379 ''' Gets HTML content from first paragraph within element. ''' 

380 paragraph = element.find( 'p' ) 

381 return str( paragraph ) if paragraph else '' 

382 

383 

384def _get_parent_content_text( element: __.typx.Any, element_type: str ) -> str: 

385 ''' Gets HTML content from content element within parent. ''' 

386 if element.parent: 

387 content_elem = element.parent.find( element_type ) 

388 return content_elem.decode_contents( ) if content_elem else '' 

389 return '' 

390 

391 

392def _get_parent_element_text( element: __.typx.Any, element_type: str ) -> str: 

393 ''' Gets HTML content from element within parent. ''' 

394 if element.parent: 

395 next_elem = element.parent.find( element_type ) 

396 return next_elem.decode_contents( ) if next_elem else '' 

397 return '' 

398 

399 

400def _get_parent_sibling_text( element: __.typx.Any, element_type: str ) -> str: 

401 ''' Gets HTML content from parent's next sibling element. ''' 

402 if element.parent: 

403 sibling = element.parent.find_next_sibling( element_type ) 

404 return sibling.decode_contents( ) if sibling else '' 

405 return '' 

406 

407 

408def _get_sibling_text( element: __.typx.Any, element_type: str ) -> str: 

409 ''' Gets HTML content from next sibling element. ''' 

410 sibling = element.find_next_sibling( element_type ) 

411 return sibling.decode_contents( ) if sibling else ''