Coverage for sources/librovore/structures/mkdocs/extraction.py: 10%

130 statements  

coverage.py v7.10.4, created at 2025-08-20 22:48 +0000

# vim: set filetype=python fileencoding=utf-8:
# -*- coding: utf-8 -*-

#============================================================================#
#                                                                            #
# Licensed under the Apache License, Version 2.0 (the "License");            #
# you may not use this file except in compliance with the License.           #
# You may obtain a copy of the License at                                    #
#                                                                            #
#     http://www.apache.org/licenses/LICENSE-2.0                             #
#                                                                            #
# Unless required by applicable law or agreed to in writing, software        #
# distributed under the License is distributed on an "AS IS" BASIS,          #
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.   #
# See the License for the specific language governing permissions and        #
# limitations under the License.                                             #
#                                                                            #
#============================================================================#


''' MkDocs documentation content extraction and processing. '''


from bs4 import BeautifulSoup as _BeautifulSoup

from . import __


MATERIAL_THEME_PATTERNS: __.cabc.Mapping[
    str, __.cabc.Mapping[ str, __.typx.Any ]
] = __.immut.Dictionary( {
    'material': __.immut.Dictionary( {
        'main_content_selectors': [
            'article[role="main"]',
            '.md-content__inner',
            '.md-typeset',
            'main .md-content',
        ],
        'api_section_selectors': [
            '.doc.doc-object-member',
            '.doc.doc-children',
            'section[id]',
            '.highlight',
        ],
        'signature_selectors': [
            '.doc-heading',
            '.highlight .n',
            'h1, h2, h3, h4, h5, h6',
            'code',
        ],
        'description_selectors': [
            '.doc-contents',
            '.doc-object-member .doc-contents',
            'p',
            '.admonition',
        ],
        'cleanup_selectors': [
            '.md-nav',
            '.md-header',
            '.md-footer',
            '.md-sidebar',
            '.headerlink',
            '.md-clipboard',
            'a.md-top',
        ],
        'code_block_selectors': [
            '.highlight',
            'pre code',
            '.codehilite',
        ],
    } ),
    'readthedocs': __.immut.Dictionary( {
        'main_content_selectors': [
            '.wy-nav-content-wrap main',
            '.document',
            '[role="main"]',
        ],
        'api_section_selectors': [
            '.section',
            'dl.class',
            'dl.function',
            'dl.method',
        ],
        'signature_selectors': [
            'dt',
            '.descname',
            '.sig-name',
        ],
        'description_selectors': [
            'dd',
            '.field-body',
            'p',
        ],
        'cleanup_selectors': [
            '.headerlink',
            '.wy-nav-top',
            '.wy-nav-side',
        ],
        'code_block_selectors': [
            '.highlight',
            'pre',
        ],
    } ),
} )

_GENERIC_PATTERN = __.immut.Dictionary( {
    'main_content_selectors': [
        'article[role="main"]',
        'main',
        '.content',
        '.document',
        'body',
    ],
    'api_section_selectors': [
        'section[id]',
        'div[id]',
        '.doc-object-member',
        'dl',
    ],
    'signature_selectors': [
        'h1, h2, h3, h4, h5, h6',
        'dt',
        'code',
        '.highlight',
    ],
    'description_selectors': [
        'p',
        'dd',
        '.description',
        '.doc-contents',
    ],
    'cleanup_selectors': [
        '.headerlink',
        'nav',
        'header',
        'footer',
        '.sidebar',
    ],
    'code_block_selectors': [
        '.highlight',
        'pre',
        'code',
    ],
} )


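# Theme names not present in MATERIAL_THEME_PATTERNS fall back to
# _GENERIC_PATTERN; see the .get( theme_name, _GENERIC_PATTERN ) lookups in
# _extract_content_from_element and _find_main_content_container below.
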
async def extract_contents(
    auxdata: __.ApplicationGlobals,
    source: str,
    objects: __.cabc.Sequence[ __.cabc.Mapping[ str, __.typx.Any ] ], /, *,
    theme: __.Absential[ str ] = __.absent,
    include_snippets: bool = True,
) -> list[ dict[ str, __.typx.Any ] ]:
    ''' Extracts documentation content for specified objects from MkDocs. '''
    base_url = __.normalize_base_url( source )
    if not objects: return [ ]
    tasks = [
        _extract_object_documentation(
            auxdata, base_url, dict( obj ), include_snippets, theme )
        for obj in objects ]
    candidate_results = await __.asyncf.gather_async(
        *tasks, return_exceptions = True )
    results: list[ dict[ str, __.typx.Any ] ] = [
        dict( result.value ) for result in candidate_results
        if __.generics.is_value( result ) and result.value is not None ]
    return results


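# Illustrative invocation of extract_contents (hypothetical inventory record;
# real records are supplied by the MkDocs inventory layer):
#     objects = [ {
#         'name': 'pkg.module.func', 'uri': 'reference/pkg/module/#$',
#         'role': 'function', 'domain': 'py', 'priority': 1 } ]
#     results = await extract_contents(
#         auxdata, 'https://example.org/docs', objects, theme = 'material' )
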
def parse_mkdocs_html(
    content: str, element_id: str, url: str, *,
    theme: __.Absential[ str ] = __.absent
) -> __.cabc.Mapping[ str, str ]:
    ''' Parses MkDocs HTML content to extract documentation sections. '''
    try: soup = _BeautifulSoup( content, 'lxml' )
    except Exception as exc:
        raise __.DocumentationParseFailure( element_id, exc ) from exc
    main_container = _find_main_content_container( soup, theme )
    if __.is_absent( main_container ):
        raise __.DocumentationContentAbsence( element_id )
    target_element = _find_target_element( main_container, element_id )
    if not target_element:
        raise __.DocumentationObjectAbsence( element_id, url )
    signature, description = _extract_content_from_element(
        target_element, element_id, theme )
    return {
        'signature': signature,
        'description': description,
        'object_name': element_id,
    }


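# Illustrative call to parse_mkdocs_html (hypothetical values):
#     sections = parse_mkdocs_html(
#         html_text, 'pkg.module.func',
#         'https://example.org/docs/reference/pkg/module/', theme = 'material' )
#     # sections carries 'signature', 'description', and 'object_name' keys.
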
def _clean_extracted_text( text: str ) -> str:
    ''' Cleans extracted text while preserving meaningful structure. '''
    text = text.strip( )
    text = __.re.sub( r' +', ' ', text )
    return __.re.sub( r'\n\s*\n', '\n\n', text )


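# For example:
#     _clean_extracted_text( '  foo   bar\n\n\n\nbaz  ' )
#     # -> 'foo bar\n\nbaz'
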
def _cleanup_content(
    content: str,
    cleanup_selectors: __.cabc.Sequence[ str ]
) -> str:
    ''' Removes unwanted elements from content. '''
    # TODO: Implement more sophisticated cleanup
    return content


def _convert_to_markdown( html_content: str ) -> str:
    ''' Converts HTML content to markdown format using markdownify. '''
    import markdownify
    return markdownify.markdownify( html_content, heading_style = 'ATX' )


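# Approximate example of the markdownify conversion (exact whitespace may
# differ by markdownify version):
#     _convert_to_markdown( '<h2>Usage</h2><p>Call <code>func</code>.</p>' )
#     # -> '## Usage\n\nCall `func`.\n\n'
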
def _derive_documentation_url(
    base_url: __.typx.Any, uri: str, object_name: str
) -> __.typx.Any:
    ''' Derives documentation URL from base URL and object URI. '''
    if uri.endswith( '#$' ):
        # mkdocstrings pattern - replace #$ with object name anchor
        clean_uri = uri[ :-2 ]
        new_path = f"{base_url.path}/{clean_uri}"
        return base_url._replace( path = new_path, fragment = object_name )
    if '#' in uri:
        path_part, fragment = uri.split( '#', 1 )
        new_path = f"{base_url.path}/{path_part}"
        return base_url._replace( path = new_path, fragment = fragment )
    new_path = f"{base_url.path}/{uri}"
    return base_url._replace( path = new_path, fragment = object_name )


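# Illustrative derivations, assuming base_url.path == '/docs' (hypothetical):
#     uri 'reference/pkg/#$'  -> path '/docs/reference/pkg/', fragment = object name
#     uri 'page/#some-anchor' -> path '/docs/page/', fragment 'some-anchor'
#     uri 'page/'             -> path '/docs/page/', fragment = object name
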
def _extract_content_from_element(
    element: __.typx.Any,
    element_id: str,
    theme: __.Absential[ str ] = __.absent
) -> tuple[ str, str ]:
    ''' Extracts signature and description content from element. '''
    theme_name = theme if not __.is_absent( theme ) else 'material'
    patterns = MATERIAL_THEME_PATTERNS.get( theme_name, _GENERIC_PATTERN )
    signature = _extract_signature( element, patterns )
    description = _extract_description( element, patterns )
    cleanup_selectors = __.typx.cast(
        __.cabc.Sequence[ str ], patterns[ 'cleanup_selectors' ] )
    description = _cleanup_content( description, cleanup_selectors )
    return signature, description


def _extract_description(
    element: __.typx.Any,
    patterns: __.cabc.Mapping[ str, __.typx.Any ]
) -> str:
    ''' Extracts description content from element. '''
    doc_contents = _find_doc_contents_container( element )
    if doc_contents:
        return doc_contents.decode_contents( )
    descriptions = _extract_using_fallback_selectors( element, patterns )
    return '\n\n'.join( descriptions ) if descriptions else ''


async def _extract_object_documentation(
    auxdata: __.ApplicationGlobals,
    base_url: __.typx.Any,
    obj: dict[ str, __.typx.Any ],
    include_snippets: bool,
    theme: __.Absential[ str ] = __.absent
) -> dict[ str, __.typx.Any ] | None:
    ''' Extracts documentation for a single object from MkDocs site. '''
    doc_url = _derive_documentation_url(
        base_url, obj[ 'uri' ], obj[ 'name' ] )
    try:
        html_content = (
            await __.retrieve_url_as_text(
                auxdata.content_cache, doc_url ) )
    except Exception as exc:
        __.acquire_scribe( __name__ ).debug(
            "Failed to retrieve %s: %s", doc_url, exc )
        return None
    anchor = doc_url.fragment or str( obj[ 'name' ] )
    try:
        parsed_content = parse_mkdocs_html(
            html_content, anchor, str( doc_url ), theme = theme )
    except Exception: return None
    description = _convert_to_markdown( parsed_content[ 'description' ] )
    snippet_max_length = 200
    if include_snippets:
        content_snippet = (
            description[ : snippet_max_length ] + '...'
            if len( description ) > snippet_max_length
            else description )
    else: content_snippet = ''
    return {
        'object_name': obj[ 'name' ],
        'object_type': obj[ 'role' ],
        'domain': obj[ 'domain' ],
        'priority': obj[ 'priority' ],
        'url': doc_url.geturl( ),
        'signature': parsed_content[ 'signature' ],
        'description': description,
        'content_snippet': content_snippet,
        'relevance_score': 1.0,
        'match_reasons': [ 'direct extraction' ],
    }


def _extract_paragraphs_from_doc_contents(
    doc_contents: __.typx.Any
) -> list[ str ]:
    ''' Legacy function - now unused after markdownify migration. '''
    # This function is kept for backward compatibility but is no longer used
    # since we now extract the full doc-contents HTML in _extract_description
    descriptions: list[ str ] = [ ]
    for child in doc_contents.children:
        if hasattr( child, 'name' ):
            if (
                child.name == 'div' and
                'admonition' in child.get( 'class', [ ] )
            ): continue
            if child.name == 'p':
                html_content = str( child )
                if html_content and html_content not in descriptions:
                    descriptions.append( html_content )
    return descriptions


def _extract_signature(
    element: __.typx.Any,
    patterns: __.cabc.Mapping[ str, __.typx.Any ]
) -> str:
    ''' Extracts signature/heading content from element. '''
    signature_selectors = __.typx.cast(
        __.cabc.Sequence[ str ], patterns[ 'signature_selectors' ] )
    for selector in signature_selectors:
        signature_elem = element.select_one( selector )
        if signature_elem:
            return _clean_extracted_text( signature_elem.get_text( ) )
    return _clean_extracted_text( element.get_text( ) )


def _extract_using_fallback_selectors(
    element: __.typx.Any,
    patterns: __.cabc.Mapping[ str, __.typx.Any ]
) -> list[ str ]:
    ''' Extracts description using fallback selectors. '''
    descriptions: list[ str ] = [ ]
    description_selectors = __.typx.cast(
        __.cabc.Sequence[ str ], patterns[ 'description_selectors' ] )
    for selector in description_selectors:
        desc_elements = element.select( selector )
        for desc_elem in desc_elements:
            if (
                desc_elem.get( 'class' ) and
                'admonition-title' in desc_elem.get( 'class', [ ] )
            ): continue
            html_content = str( desc_elem )
            if html_content and html_content not in descriptions:
                descriptions.append( html_content )
    return descriptions


def _find_doc_contents_container( element: __.typx.Any ) -> __.typx.Any | None:
    ''' Finds the doc-contents container for the element. '''
    if element.name in ( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ):
        sibling = element.next_sibling
        while sibling:
            if (
                hasattr( sibling, 'get' ) and sibling.name == 'div' and
                'doc-contents' in sibling.get( 'class', [ ] )
            ): return sibling
            sibling = sibling.next_sibling
    return element.select_one( '.doc-contents' )


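# Typical mkdocstrings/Material markup targeted by the doc-contents lookup
# (illustrative, simplified):
#     <h2 id="pkg.module.func" class="doc doc-heading">func</h2>
#     <div class="doc doc-contents"><p>Function description ...</p></div>
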
def _find_target_element(
    container: __.typx.Any, element_id: str
) -> __.typx.Any:
    ''' Finds target element within main container using ID strategies. '''
    target = container.find( id = element_id )
    if target: return target
    target = container.find( attrs = { 'data-toc-label': element_id } )
    if target: return target
    for heading in container.find_all(
        [ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ] ):
        if element_id in heading.get_text( ):
            return heading
    for section in container.find_all( 'section' ):
        class_attr = section.get( 'class' )
        if class_attr and element_id in ' '.join( class_attr ):
            return section
    return container


def _find_main_content_container(
    soup: __.typx.Any, theme: __.Absential[ str ] = __.absent
) -> __.Absential[ __.typx.Any ]:
    ''' Finds main content container using theme-specific strategies. '''
    theme_name = theme if not __.is_absent( theme ) else 'material'
    patterns = MATERIAL_THEME_PATTERNS.get( theme_name, _GENERIC_PATTERN )
    main_selectors = __.typx.cast(
        __.cabc.Sequence[ str ], patterns[ 'main_content_selectors' ] )
    for selector in main_selectors:
        container = soup.select_one( selector )
        if container: return container
    return __.absent

402 return __.absent