Coverage for sources/librovore/structures/mkdocs/extraction.py: 11%

120 statements  

« prev     ^ index     » next       coverage.py v7.10.5, created at 2025-08-29 01:14 +0000

1# vim: set filetype=python fileencoding=utf-8: 

2# -*- coding: utf-8 -*- 

3 

4#============================================================================# 

5# # 

6# Licensed under the Apache License, Version 2.0 (the "License"); # 

7# you may not use this file except in compliance with the License. # 

8# You may obtain a copy of the License at # 

9# # 

10# http://www.apache.org/licenses/LICENSE-2.0 # 

11# # 

12# Unless required by applicable law or agreed to in writing, software # 

13# distributed under the License is distributed on an "AS IS" BASIS, # 

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 

15# See the License for the specific language governing permissions and # 

16# limitations under the License. # 

17# # 

18#============================================================================# 

19 

20 

21''' MkDocs documentation content extraction and processing. ''' 

22 

23 

24from bs4 import BeautifulSoup as _BeautifulSoup 

25 

26from . import __ 

27 

28 

# Theme-specific CSS selector sets for MkDocs content extraction, keyed by
# theme name.  Each list is ordered by priority: the lookup helpers below try
# selectors in order and use the first match.
#   main_content_selectors: candidates for the page's main content container
#   api_section_selectors:  containers wrapping individual API entries
#   signature_selectors:    elements carrying an object's signature/heading
#   description_selectors:  elements carrying prose description content
#   cleanup_selectors:      site chrome (nav, header links, ...) to strip
#   code_block_selectors:   elements containing code samples
# Themes not present here fall back to the generic selector set.
MATERIAL_THEME_PATTERNS: __.cabc.Mapping[
    str, __.cabc.Mapping[ str, __.typx.Any ]
] = __.immut.Dictionary( {
    'material': __.immut.Dictionary( {
        'main_content_selectors': [
            'article[role="main"]',
            '.md-content__inner',
            '.md-typeset',
            'main .md-content',
        ],
        'api_section_selectors': [
            '.doc.doc-object-member',
            '.doc.doc-children',
            'section[id]',
            '.highlight',
        ],
        'signature_selectors': [
            '.doc-heading',
            '.highlight .n',
            'h1, h2, h3, h4, h5, h6',
            'code',
        ],
        'description_selectors': [
            '.doc-contents',
            '.doc-object-member .doc-contents',
            'p',
            '.admonition',
        ],
        'cleanup_selectors': [
            '.md-nav',
            '.md-header',
            '.md-footer',
            '.md-sidebar',
            '.headerlink',
            '.md-clipboard',
            'a.md-top',
        ],
        'code_block_selectors': [
            '.highlight',
            'pre code',
            '.codehilite',
        ],
    } ),
    'readthedocs': __.immut.Dictionary( {
        'main_content_selectors': [
            '.wy-nav-content-wrap main',
            '.document',
            '[role="main"]',
        ],
        'api_section_selectors': [
            '.section',
            'dl.class',
            'dl.function',
            'dl.method',
        ],
        'signature_selectors': [
            'dt',
            '.descname',
            '.sig-name',
        ],
        'description_selectors': [
            'dd',
            '.field-body',
            'p',
        ],
        'cleanup_selectors': [
            '.headerlink',
            '.wy-nav-top',
            '.wy-nav-side',
        ],
        'code_block_selectors': [
            '.highlight',
            'pre',
        ],
    } ),
} )

105 

# Fallback selector set used when a site's theme is unrecognized.  Favors
# broadly-applicable HTML structure ('main', 'section[id]', plain tags) over
# theme-specific class names; same key schema as the per-theme entries above.
_GENERIC_PATTERN = __.immut.Dictionary( {
    'main_content_selectors': [
        'article[role="main"]',
        'main',
        '.content',
        '.document',
        'body',
    ],
    'api_section_selectors': [
        'section[id]',
        'div[id]',
        '.doc-object-member',
        'dl',
    ],
    'signature_selectors': [
        'h1, h2, h3, h4, h5, h6',
        'dt',
        'code',
        '.highlight',
    ],
    'description_selectors': [
        'p',
        'dd',
        '.description',
        '.doc-contents',
    ],
    'cleanup_selectors': [
        '.headerlink',
        'nav',
        'header',
        'footer',
        '.sidebar',
    ],
    'code_block_selectors': [
        '.highlight',
        'pre',
        'code',
    ],
} )

145 

146 

async def extract_contents(
    auxdata: __.ApplicationGlobals,
    source: str,
    objects: __.cabc.Sequence[ __.InventoryObject ], /, *,
    theme: __.Absential[ str ] = __.absent,
    include_snippets: bool = True,
) -> list[ __.ContentDocument ]:
    ''' Extracts documentation content for specified objects from MkDocs.

        Fans out one extraction task per inventory object; failures and
        objects without content are silently dropped from the result.
    '''
    url_base = __.normalize_base_url( source )
    if not objects: return [ ]
    extractions = [
        _extract_object_documentation(
            auxdata, url_base, entry, include_snippets, theme )
        for entry in objects ]
    # Gather with exceptions captured so one bad page cannot abort the batch.
    outcomes = await __.asyncf.gather_async(
        *extractions, return_exceptions = True )
    documents: list[ __.ContentDocument ] = [ ]
    for outcome in outcomes:
        if not __.generics.is_value( outcome ): continue
        if outcome.value is None: continue
        documents.append( outcome.value )
    return documents

167 

168 

def parse_mkdocs_html(
    content: str, element_id: str, url: str, *,
    theme: __.Absential[ str ] = __.absent
) -> __.cabc.Mapping[ str, str ]:
    ''' Parses MkDocs HTML content to extract documentation sections.

        Locates the main content container, finds the element identified by
        ``element_id`` within it, and returns a mapping with 'signature',
        'description', and 'object_name' entries.

        Raises DocumentationParseFailure when the HTML cannot be parsed,
        DocumentationContentAbsence when no main container is found, and
        DocumentationObjectAbsence when the target element is missing.
    '''
    try: soup = _BeautifulSoup( content, 'lxml' )
    except Exception as exc:
        raise __.DocumentationParseFailure( element_id, exc ) from exc
    container = _find_main_content_container( soup, theme )
    if __.is_absent( container ):
        raise __.DocumentationContentAbsence( element_id )
    target = _find_target_element( container, element_id )
    if not target:
        raise __.DocumentationObjectAbsence( element_id, url )
    signature, description = _extract_content_from_element(
        target, element_id, theme )
    return dict(
        signature = signature,
        description = description,
        object_name = element_id )

190 

191 

def _clean_extracted_text( text: str ) -> str:
    ''' Cleans extracted text while preserving meaningful structure. '''
    # Trim outer whitespace, collapse runs of spaces, then squeeze runs of
    # blank lines to a single blank line so paragraph breaks survive.
    trimmed = text.strip( )
    despaced = __.re.sub( r' +', ' ', trimmed )
    return __.re.sub( r'\n\s*\n', '\n\n', despaced )

197 

198 

def _cleanup_content(
    content: str,
    cleanup_selectors: __.cabc.Sequence[ str ]
) -> str:
    ''' Removes unwanted elements from content.

        Parses the HTML fragment and drops every element matching one of
        the theme's cleanup selectors (navigation chrome, header links,
        clipboard buttons, etc.).  Best-effort: on parse failure the
        content is returned unchanged rather than losing the description.
    '''
    if not content or not cleanup_selectors: return content
    try: soup = _BeautifulSoup( content, 'lxml' )
    except Exception: return content
    for selector in cleanup_selectors:
        for element in soup.select( selector ):
            element.decompose( )
    # lxml wraps fragments in <html><body>; unwrap to return a fragment.
    body = soup.body
    if body is not None: return body.decode_contents( )
    return soup.decode_contents( )

206 

207 

def _convert_to_markdown( html_content: str ) -> str:
    ''' Converts HTML content to markdown format using markdownify. '''
    # Imported lazily so the dependency is only loaded on this path.
    import markdownify
    converter = markdownify.markdownify
    return converter( html_content, heading_style = 'ATX' )

212 

213 

def _derive_documentation_url(
    base_url: __.typx.Any, uri: str, object_name: str
) -> __.typx.Any:
    ''' Derives documentation URL from base URL and object URI.

        Appends the inventory URI to the base path and selects a fragment:
        the URI's own anchor when present, else the object name.
    '''
    prefix = base_url.path
    if uri.endswith( '#$' ):
        # mkdocstrings emits '#$' as a placeholder anchor; substitute the
        # object name as the fragment.
        return base_url._replace(
            path = f"{prefix}/{uri[ : -2 ]}", fragment = object_name )
    if '#' in uri:
        page, anchor = uri.split( '#', 1 )
        return base_url._replace(
            path = f"{prefix}/{page}", fragment = anchor )
    return base_url._replace(
        path = f"{prefix}/{uri}", fragment = object_name )

229 

230 

def _extract_content_from_element(
    element: __.typx.Any,
    element_id: str,
    theme: __.Absential[ str ] = __.absent
) -> tuple[ str, str ]:
    ''' Extracts signature and description content from element.

        Returns a ``(signature, description)`` pair; the description has
        had cleanup selectors applied.
    '''
    # Default to the Material theme's selectors when none is specified;
    # unknown theme names fall back to the generic selector set.
    if __.is_absent( theme ): theme_name = 'material'
    else: theme_name = theme
    patterns = MATERIAL_THEME_PATTERNS.get( theme_name, _GENERIC_PATTERN )
    signature = _extract_signature( element, patterns )
    raw_description = _extract_description( element, patterns )
    selectors = __.typx.cast(
        __.cabc.Sequence[ str ], patterns[ 'cleanup_selectors' ] )
    return signature, _cleanup_content( raw_description, selectors )

245 

246 

def _extract_description(
    element: __.typx.Any,
    patterns: __.cabc.Mapping[ str, __.typx.Any ]
) -> str:
    ''' Extracts description content from element.

        Prefers the mkdocstrings '.doc-contents' container; otherwise joins
        whatever the theme's fallback description selectors match.
    '''
    container = _find_doc_contents_container( element )
    if container: return container.decode_contents( )
    fragments = _extract_using_fallback_selectors( element, patterns )
    if not fragments: return ''
    return '\n\n'.join( fragments )

257 

258 

async def _extract_object_documentation(
    auxdata: __.ApplicationGlobals,
    base_url: __.typx.Any,
    obj: __.InventoryObject,
    include_snippets: bool,
    theme: __.Absential[ str ] = __.absent
) -> __.ContentDocument | None:
    ''' Extracts documentation for a single object from MkDocs site.

        Returns None (rather than raising) when the page cannot be
        retrieved or parsed, so batch extraction can continue.
    '''
    doc_url = _derive_documentation_url( base_url, obj.uri, obj.name )
    try:
        html_content = await __.retrieve_url_as_text(
            auxdata.content_cache, doc_url )
    except Exception as exc:
        __.acquire_scribe( __name__ ).debug(
            "Failed to retrieve %s: %s", doc_url, exc )
        return None
    anchor = doc_url.fragment or str( obj.name )
    try:
        parsed_content = parse_mkdocs_html(
            html_content, anchor, str( doc_url ), theme = theme )
    except Exception: return None
    description = _convert_to_markdown( parsed_content[ 'description' ] )
    snippet_max_length = 200
    content_snippet = ''
    if include_snippets:
        # Truncate long descriptions with an ellipsis for the snippet.
        if len( description ) > snippet_max_length:
            content_snippet = description[ : snippet_max_length ] + '...'
        else: content_snippet = description
    theme_label = 'unknown' if __.is_absent( theme ) else theme
    return __.ContentDocument(
        inventory_object = obj,
        signature = parsed_content[ 'signature' ],
        description = description,
        content_snippet = content_snippet,
        documentation_url = doc_url.geturl( ),
        extraction_metadata = __.immut.Dictionary( {
            'theme': theme_label,
            'extraction_method': 'mkdocs_html_parsing',
            'relevance_score': 1.0,
            'match_reasons': [ 'direct extraction' ],
        } ) )

303 

304 

def _extract_signature(
    element: __.typx.Any,
    patterns: __.cabc.Mapping[ str, __.typx.Any ]
) -> str:
    ''' Extracts signature/heading content from element.

        Tries the theme's signature selectors in priority order; falls back
        to the element's full text when none matches.
    '''
    selectors = __.typx.cast(
        __.cabc.Sequence[ str ], patterns[ 'signature_selectors' ] )
    for selector in selectors:
        match = element.select_one( selector )
        if not match: continue
        return _clean_extracted_text( match.get_text( ) )
    return _clean_extracted_text( element.get_text( ) )

317 

318 

def _extract_using_fallback_selectors(
    element: __.typx.Any,
    patterns: __.cabc.Mapping[ str, __.typx.Any ]
) -> list[ str ]:
    ''' Extracts description using fallback selectors.

        Collects the HTML of every element matching the theme's description
        selectors, in selector-priority order, skipping admonition titles
        and duplicate fragments.
    '''
    descriptions: list[ str ] = [ ]
    seen: set[ str ] = set( )  # O(1) dedupe instead of list membership scan
    description_selectors = __.typx.cast(
        __.cabc.Sequence[ str ], patterns[ 'description_selectors' ] )
    for selector in description_selectors:
        for desc_elem in element.select( selector ):
            # Admonition titles are styling artifacts, not content.
            # Single attribute lookup; 'class' may be absent (None).
            classes = desc_elem.get( 'class' ) or [ ]
            if 'admonition-title' in classes: continue
            html_content = str( desc_elem )
            if not html_content or html_content in seen: continue
            seen.add( html_content )
            descriptions.append( html_content )
    return descriptions

338 

339 

def _find_doc_contents_container( element: __.typx.Any ) -> __.typx.Any | None:
    ''' Finds the doc-contents container for the element.

        For headings, walks following siblings looking for the
        'div.doc-contents' that mkdocstrings places after the heading;
        otherwise searches within the element itself.
    '''
    if element.name in ( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ):
        sibling = element.next_sibling
        # NOTE: truthiness test intentionally stops at empty/falsy nodes,
        # matching the original traversal behavior.
        while sibling:
            is_contents_div = (
                hasattr( sibling, 'get' ) and sibling.name == 'div'
                and 'doc-contents' in sibling.get( 'class', [ ] ) )
            if is_contents_div: return sibling
            sibling = sibling.next_sibling
    return element.select_one( '.doc-contents' )

351 

352 

def _find_target_element(
    container: __.typx.Any, element_id: str
) -> __.typx.Any:
    ''' Finds target element within main container using ID strategies.

        Tries progressively looser strategies and always returns something,
        falling back to the whole container.
    '''
    # Strategy 1: exact 'id' attribute match.
    by_id = container.find( id = element_id )
    if by_id: return by_id
    # Strategy 2: mkdocstrings 'data-toc-label' attribute match.
    by_label = container.find( attrs = { 'data-toc-label': element_id } )
    if by_label: return by_label
    # Strategy 3: heading whose text mentions the identifier.
    for heading in container.find_all(
        [ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ]
    ):
        if element_id in heading.get_text( ): return heading
    # Strategy 4: section whose class list mentions the identifier.
    for section in container.find_all( 'section' ):
        classes = section.get( 'class' )
        if classes and element_id in ' '.join( classes ): return section
    return container

370 

371 

def _find_main_content_container(
    soup: __.typx.Any, theme: __.Absential[ str ] = __.absent
) -> __.Absential[ __.typx.Any ]:
    ''' Finds main content container using theme-specific strategies.

        Returns the first matching container, or ``absent`` when no
        selector matches.
    '''
    # Default to Material; unknown themes use the generic selector set.
    if __.is_absent( theme ): theme_name = 'material'
    else: theme_name = theme
    patterns = MATERIAL_THEME_PATTERNS.get( theme_name, _GENERIC_PATTERN )
    selectors = __.typx.cast(
        __.cabc.Sequence[ str ], patterns[ 'main_content_selectors' ] )
    for selector in selectors:
        candidate = soup.select_one( selector )
        if candidate: return candidate
    return __.absent