Coverage for sources/librovore/structures/mkdocs/extraction.py: 11%

# vim: set filetype=python fileencoding=utf-8:
# -*- coding: utf-8 -*-

#============================================================================#
#                                                                            #
#  Licensed under the Apache License, Version 2.0 (the "License");           #
#  you may not use this file except in compliance with the License.          #
#  You may obtain a copy of the License at                                   #
#                                                                            #
#      http://www.apache.org/licenses/LICENSE-2.0                            #
#                                                                            #
#  Unless required by applicable law or agreed to in writing, software       #
#  distributed under the License is distributed on an "AS IS" BASIS,         #
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  #
#  See the License for the specific language governing permissions and       #
#  limitations under the License.                                            #
#                                                                            #
#============================================================================#


''' MkDocs documentation content extraction and processing. '''


from bs4 import BeautifulSoup as _BeautifulSoup

from . import __


MATERIAL_THEME_PATTERNS: __.cabc.Mapping[
    str, __.cabc.Mapping[ str, __.typx.Any ]
] = __.immut.Dictionary( {
    'material': __.immut.Dictionary( {
        'main_content_selectors': [
            'article[role="main"]',
            '.md-content__inner',
            '.md-typeset',
            'main .md-content',
        ],
        'api_section_selectors': [
            '.doc.doc-object-member',
            '.doc.doc-children',
            'section[id]',
            '.highlight',
        ],
        'description_selectors': [
            '.doc-contents',
            '.doc-object-member .doc-contents',
            'p',
            '.admonition',
        ],
        'cleanup_selectors': [
            '.md-nav',
            '.md-header',
            '.md-footer',
            '.md-sidebar',
            '.headerlink',
            '.md-clipboard',
            'a.md-top',
        ],
        'code_block_selectors': [
            '.highlight',
            'pre code',
            '.codehilite',
        ],
    } ),
    'readthedocs': __.immut.Dictionary( {
        'main_content_selectors': [
            '.wy-nav-content-wrap main',
            '.document',
            '[role="main"]',
        ],
        'api_section_selectors': [
            '.section',
            'dl.class',
            'dl.function',
            'dl.method',
        ],
        'description_selectors': [
            'dd',
            '.field-body',
            'p',
        ],
        'cleanup_selectors': [
            '.headerlink',
            '.wy-nav-top',
            '.wy-nav-side',
        ],
        'code_block_selectors': [
            '.highlight',
            'pre',
        ],
    } ),
} )

_GENERIC_PATTERN = __.immut.Dictionary( {
    'main_content_selectors': [
        'article[role="main"]',
        'main',
        '.content',
        '.document',
        'body',
    ],
    'api_section_selectors': [
        'section[id]',
        'div[id]',
        '.doc-object-member',
        'dl',
    ],
    'description_selectors': [
        'p',
        'dd',
        '.description',
        '.doc-contents',
    ],
    'cleanup_selectors': [
        '.headerlink',
        'nav',
        'header',
        'footer',
        '.sidebar',
    ],
    'code_block_selectors': [
        '.highlight',
        'pre',
        'code',
    ],
} )
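

# Selector patterns are looked up by theme name, with _GENERIC_PATTERN as
# the fallback for unrecognized themes. A minimal illustration (the theme
# string here is a hypothetical example):
#
#     patterns = MATERIAL_THEME_PATTERNS.get( 'mkdocs', _GENERIC_PATTERN )
#     patterns[ 'main_content_selectors' ][ 0 ]  # 'article[role="main"]'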


async def extract_contents(
    auxdata: __.ApplicationGlobals,
    source: str,
    objects: __.cabc.Sequence[ __.InventoryObject ], /, *,
    theme: __.Absential[ str ] = __.absent,
) -> list[ __.ContentDocument ]:
    ''' Extracts documentation content for specified objects from MkDocs. '''
    base_url = __.normalize_base_url( source )
    if not objects: return [ ]
    tasks = [
        _extract_object_documentation(
            auxdata, base_url, source, obj, theme )
        for obj in objects ]
    candidate_results = await __.asyncf.gather_async(
        *tasks, return_exceptions = True )
    results: list[ __.ContentDocument ] = [
        result.value for result in candidate_results
        if __.generics.is_value( result ) and result.value is not None ]
    return results
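

# Hypothetical call site for the coroutine above; the URL and inventory
# objects are placeholders, and `auxdata` comes from application startup:
#
#     documents = await extract_contents(
#         auxdata, 'https://example.org/docs/', inventory_objects,
#         theme = 'material' )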


def parse_mkdocs_html(
    content: str, element_id: str, url: str, *,
    theme: __.Absential[ str ] = __.absent
) -> __.cabc.Mapping[ str, str ]:
    ''' Parses MkDocs HTML content to extract documentation sections. '''
    try: soup = _BeautifulSoup( content, 'lxml' )
    except Exception as exc:
        raise __.DocumentationParseFailure( element_id, exc ) from exc
    main_container = _find_main_content_container( soup, theme )
    if __.is_absent( main_container ):
        raise __.DocumentationContentAbsence( element_id )
    target_element = _find_target_element( main_container, element_id )
    if not target_element:
        raise __.DocumentationObjectAbsence( element_id, url )
    description = _extract_content_from_element(
        target_element, element_id, theme )
    return {
        'description': description,
        'object_name': element_id,
    }
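

# The mapping returned above always carries exactly these two keys; an
# illustrative result (the values are hypothetical):
#
#     { 'description': '<p>Frobnicates widgets.</p>',
#       'object_name': 'pkg.frobnicate' }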


def _cleanup_content(
    content: str,
    cleanup_selectors: __.cabc.Sequence[ str ]
) -> str:
    ''' Removes unwanted elements from content. '''
    # TODO: Implement more sophisticated cleanup
    return content
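

# One possible shape for the TODO above (a sketch, not the final design):
# reparse the fragment and drop any nodes matching the cleanup selectors.
#
#     soup = _BeautifulSoup( content, 'lxml' )
#     for selector in cleanup_selectors:
#         for node in soup.select( selector ): node.decompose( )
#     return str( soup )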


def _convert_to_markdown( html_content: str ) -> str:
    ''' Converts HTML content to markdown format using markdownify. '''
    import markdownify
    return markdownify.markdownify( html_content, heading_style = 'ATX' )
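

# Roughly what markdownify produces with heading_style = 'ATX' (exact
# whitespace may differ by version):
#
#     _convert_to_markdown( '<h2>Usage</h2><p>Call <code>f</code>.</p>' )
#     # -> '## Usage\n\nCall `f`.'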


def _derive_documentation_url(
    base_url: __.typx.Any, uri: str, object_name: str
) -> __.typx.Any:
    ''' Derives documentation URL from base URL and object URI. '''
    if uri.endswith( '#$' ):
        # mkdocstrings pattern - replace #$ with object name anchor
        clean_uri = uri[ :-2 ]
        new_path = f"{base_url.path}/{clean_uri}"
        return base_url._replace( path = new_path, fragment = object_name )
    if '#' in uri:
        path_part, fragment = uri.split( '#', 1 )
        new_path = f"{base_url.path}/{path_part}"
        return base_url._replace( path = new_path, fragment = fragment )
    new_path = f"{base_url.path}/{uri}"
    return base_url._replace( path = new_path, fragment = object_name )
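

# Worked examples for the three branches above, assuming base_url.path is
# '/docs' (URIs and object names are hypothetical):
#
#     uri 'api/module/#$', object 'pkg.Klass'
#         -> path '/docs/api/module/', fragment 'pkg.Klass'
#     uri 'page/#section', object 'pkg.Klass'
#         -> path '/docs/page/', fragment 'section'
#     uri 'page/', object 'pkg.Klass'
#         -> path '/docs/page/', fragment 'pkg.Klass'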


def _extract_content_from_element(
    element: __.typx.Any,
    element_id: str,
    theme: __.Absential[ str ] = __.absent
) -> str:
    ''' Extracts description content from element and applies cleanup. '''
    theme_name = theme if not __.is_absent( theme ) else 'material'
    patterns = MATERIAL_THEME_PATTERNS.get( theme_name, _GENERIC_PATTERN )
    description = _extract_description( element, patterns )
    cleanup_selectors = __.typx.cast(
        __.cabc.Sequence[ str ], patterns[ 'cleanup_selectors' ] )
    return _cleanup_content( description, cleanup_selectors )


def _extract_description(
    element: __.typx.Any,
    patterns: __.cabc.Mapping[ str, __.typx.Any ]
) -> str:
    ''' Extracts description content from element. '''
    doc_contents = _find_doc_contents_container( element )
    if doc_contents:
        return doc_contents.decode_contents( )
    descriptions = _extract_using_fallback_selectors( element, patterns )
    return '\n\n'.join( descriptions ) if descriptions else ''


async def _extract_object_documentation(
    auxdata: __.ApplicationGlobals,
    base_url: __.typx.Any,
    location: str,
    obj: __.InventoryObject,
    theme: __.Absential[ str ] = __.absent,
) -> __.ContentDocument | None:
    ''' Extracts documentation for a single object from MkDocs site. '''
    doc_url = _derive_documentation_url(
        base_url, obj.uri, obj.name )
    try:
        html_content = (
            await __.retrieve_url_as_text(
                auxdata.content_cache, doc_url ) )
    except Exception as exc:
        __.acquire_scribe( __name__ ).debug(
            "Failed to retrieve %s: %s", doc_url, exc )
        return None
    anchor = doc_url.fragment or str( obj.name )
    try:
        parsed_content = parse_mkdocs_html(
            html_content, anchor, str( doc_url ), theme = theme )
    except Exception: return None
    description = _convert_to_markdown( parsed_content[ 'description' ] )
    content_id = __.produce_content_id( location, obj.name )
    return __.ContentDocument(
        inventory_object = obj,
        content_id = content_id,
        description = description,
        documentation_url = doc_url.geturl( ),
        extraction_metadata = __.immut.Dictionary( {
            'theme': theme if not __.is_absent( theme ) else 'unknown',
            'extraction_method': 'mkdocs_html_parsing',
            'relevance_score': 1.0,
            'match_reasons': [ 'direct extraction' ],
        } )
    )
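

# The helper above is a four-step pipeline: derive the page URL, fetch it
# through the content cache, parse the anchored section out of the HTML,
# then convert the extracted fragment to markdown. Retrieval and parsing
# failures degrade to None so that one bad page cannot abort the whole
# batch gathered by extract_contents.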


def _extract_using_fallback_selectors(
    element: __.typx.Any,
    patterns: __.cabc.Mapping[ str, __.typx.Any ]
) -> list[ str ]:
    ''' Extracts description using fallback selectors. '''
    descriptions: list[ str ] = [ ]
    description_selectors = __.typx.cast(
        __.cabc.Sequence[ str ], patterns[ 'description_selectors' ] )
    for selector in description_selectors:
        desc_elements = element.select( selector )
        for desc_elem in desc_elements:
            if (
                desc_elem.get( 'class' ) and
                'admonition-title' in desc_elem.get( 'class', [ ] )
            ): continue
            html_content = str( desc_elem )
            if html_content and html_content not in descriptions:
                descriptions.append( html_content )
    return descriptions


def _find_doc_contents_container( element: __.typx.Any ) -> __.typx.Any | None:
    ''' Finds the doc-contents container for the element. '''
    if element.name in ( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ):
        sibling = element.next_sibling
        while sibling:
            if (
                hasattr( sibling, 'get' ) and sibling.name == 'div' and
                'doc-contents' in sibling.get( 'class', [ ] )
            ): return sibling
            sibling = sibling.next_sibling
    return element.select_one( '.doc-contents' )
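

# The sibling walk above targets mkdocstrings-style markup, where the
# description block immediately follows its heading; a minimal
# hypothetical shape:
#
#     <h2 id="pkg.Klass">Klass</h2>
#     <div class="doc doc-contents">...</div>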


def _find_target_element(
    container: __.typx.Any, element_id: str
) -> __.typx.Any:
    ''' Finds target element within main container using ID strategies. '''
    target = container.find( id = element_id )
    if target: return target
    target = container.find( attrs = { 'data-toc-label': element_id } )
    if target: return target
    for heading in container.find_all(
            [ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ] ):
        if element_id in heading.get_text( ):
            return heading
    for section in container.find_all( 'section' ):
        class_attr = section.get( 'class' )
        if class_attr and element_id in ' '.join( class_attr ):
            return section
    return container


def _find_main_content_container(
    soup: __.typx.Any, theme: __.Absential[ str ] = __.absent
) -> __.Absential[ __.typx.Any ]:
    ''' Finds main content container using theme-specific strategies. '''
    theme_name = theme if not __.is_absent( theme ) else 'material'
    patterns = MATERIAL_THEME_PATTERNS.get( theme_name, _GENERIC_PATTERN )
    main_selectors = __.typx.cast(
        __.cabc.Sequence[ str ], patterns[ 'main_content_selectors' ] )
    for selector in main_selectors:
        container = soup.select_one( selector )
        if container: return container
    return __.absent
338 return __.absent