Coverage for sources/librovore/structures/mkdocs/extraction.py: 11%

120 statements  

« prev     ^ index     » next       coverage.py v7.10.5, created at 2025-08-29 01:14 +0000

1# vim: set filetype=python fileencoding=utf-8: 

2# -*- coding: utf-8 -*- 

3 

4#============================================================================# 

5# # 

6# Licensed under the Apache License, Version 2.0 (the "License"); # 

7# you may not use this file except in compliance with the License. # 

8# You may obtain a copy of the License at # 

9# # 

10# http://www.apache.org/licenses/LICENSE-2.0 # 

11# # 

12# Unless required by applicable law or agreed to in writing, software # 

13# distributed under the License is distributed on an "AS IS" BASIS, # 

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 

15# See the License for the specific language governing permissions and # 

16# limitations under the License. # 

17# # 

18#============================================================================# 

19 

20 

21''' MkDocs documentation content extraction and processing. ''' 

22 

23 

24from bs4 import BeautifulSoup as _BeautifulSoup 

25 

26from . import __ 

27 

28 

# Theme-specific CSS selector sets for MkDocs content extraction, keyed by
# theme name.  Each list is ordered by priority: the lookup helpers below try
# selectors in order and use the first match.
#   main_content_selectors: candidates for the page's main content container
#   api_section_selectors:  containers wrapping individual API entries
#   signature_selectors:    elements carrying an object's signature/heading
#   description_selectors:  elements carrying prose description content
#   cleanup_selectors:      site chrome (nav, header links, ...) to strip
#   code_block_selectors:   elements containing code samples
# Themes not present here fall back to the generic selector set.
MATERIAL_THEME_PATTERNS: __.cabc.Mapping[
    str, __.cabc.Mapping[ str, __.typx.Any ]
] = __.immut.Dictionary( {
    'material': __.immut.Dictionary( {
        'main_content_selectors': [
            'article[role="main"]',
            '.md-content__inner',
            '.md-typeset',
            'main .md-content',
        ],
        'api_section_selectors': [
            '.doc.doc-object-member',
            '.doc.doc-children',
            'section[id]',
            '.highlight',
        ],
        'signature_selectors': [
            '.doc-heading',
            '.highlight .n',
            'h1, h2, h3, h4, h5, h6',
            'code',
        ],
        'description_selectors': [
            '.doc-contents',
            '.doc-object-member .doc-contents',
            'p',
            '.admonition',
        ],
        'cleanup_selectors': [
            '.md-nav',
            '.md-header',
            '.md-footer',
            '.md-sidebar',
            '.headerlink',
            '.md-clipboard',
            'a.md-top',
        ],
        'code_block_selectors': [
            '.highlight',
            'pre code',
            '.codehilite',
        ],
    } ),
    'readthedocs': __.immut.Dictionary( {
        'main_content_selectors': [
            '.wy-nav-content-wrap main',
            '.document',
            '[role="main"]',
        ],
        'api_section_selectors': [
            '.section',
            'dl.class',
            'dl.function',
            'dl.method',
        ],
        'signature_selectors': [
            'dt',
            '.descname',
            '.sig-name',
        ],
        'description_selectors': [
            'dd',
            '.field-body',
            'p',
        ],
        'cleanup_selectors': [
            '.headerlink',
            '.wy-nav-top',
            '.wy-nav-side',
        ],
        'code_block_selectors': [
            '.highlight',
            'pre',
        ],
    } ),
} )

105 

# Fallback selector set used when a site's theme is unrecognized.  Favors
# broadly-applicable HTML structure ('main', 'section[id]', plain tags) over
# theme-specific class names; same key schema as the per-theme entries above.
_GENERIC_PATTERN = __.immut.Dictionary( {
    'main_content_selectors': [
        'article[role="main"]',
        'main',
        '.content',
        '.document',
        'body',
    ],
    'api_section_selectors': [
        'section[id]',
        'div[id]',
        '.doc-object-member',
        'dl',
    ],
    'signature_selectors': [
        'h1, h2, h3, h4, h5, h6',
        'dt',
        'code',
        '.highlight',
    ],
    'description_selectors': [
        'p',
        'dd',
        '.description',
        '.doc-contents',
    ],
    'cleanup_selectors': [
        '.headerlink',
        'nav',
        'header',
        'footer',
        '.sidebar',
    ],
    'code_block_selectors': [
        '.highlight',
        'pre',
        'code',
    ],
} )

145 

146 

async def extract_contents(
    auxdata: __.ApplicationGlobals,
    source: str,
    objects: __.cabc.Sequence[ __.InventoryObject ], /, *,
    theme: __.Absential[ str ] = __.absent,
    include_snippets: bool = True,
) -> list[ __.ContentDocument ]:
    ''' Extracts documentation content for specified objects from MkDocs.

        Fans out one extraction task per inventory object; failures and
        objects without content are silently dropped from the result.
    '''
    url_base = __.normalize_base_url( source )
    if not objects: return [ ]
    extractions = [
        _extract_object_documentation(
            auxdata, url_base, entry, include_snippets, theme )
        for entry in objects ]
    # Gather with exceptions captured so one bad page cannot abort the batch.
    outcomes = await __.asyncf.gather_async(
        *extractions, return_exceptions = True )
    documents: list[ __.ContentDocument ] = [ ]
    for outcome in outcomes:
        if not __.generics.is_value( outcome ): continue
        if outcome.value is None: continue
        documents.append( outcome.value )
    return documents

167 

168 

def parse_mkdocs_html(
    content: str, element_id: str, url: str, *,
    theme: __.Absential[ str ] = __.absent
) -> __.cabc.Mapping[ str, str ]:
    ''' Parses MkDocs HTML content to extract documentation sections.

        Locates the main content container, finds the element identified by
        ``element_id`` within it, and returns a mapping with 'signature',
        'description', and 'object_name' entries.

        Raises DocumentationParseFailure when the HTML cannot be parsed,
        DocumentationContentAbsence when no main container is found, and
        DocumentationObjectAbsence when the target element is missing.
    '''
    try: soup = _BeautifulSoup( content, 'lxml' )
    except Exception as exc:
        raise __.DocumentationParseFailure( element_id, exc ) from exc
    container = _find_main_content_container( soup, theme )
    if __.is_absent( container ):
        raise __.DocumentationContentAbsence( element_id )
    target = _find_target_element( container, element_id )
    if not target:
        raise __.DocumentationObjectAbsence( element_id, url )
    signature, description = _extract_content_from_element(
        target, element_id, theme )
    return dict(
        signature = signature,
        description = description,
        object_name = element_id )

190 

191 

def _clean_extracted_text( text: str ) -> str:
    ''' Cleans extracted text while preserving meaningful structure. '''
    # Trim outer whitespace, collapse runs of spaces, then squeeze runs of
    # blank lines to a single blank line so paragraph breaks survive.
    trimmed = text.strip( )
    despaced = __.re.sub( r' +', ' ', trimmed )
    return __.re.sub( r'\n\s*\n', '\n\n', despaced )

197 

198 

def _cleanup_content(
    content: str,
    cleanup_selectors: __.cabc.Sequence[ str ]
) -> str:
    ''' Removes unwanted elements from content.

        Parses the HTML fragment and drops every element matching one of
        the theme's cleanup selectors (navigation chrome, header links,
        clipboard buttons, etc.).  Best-effort: on parse failure the
        content is returned unchanged rather than losing the description.
    '''
    if not content or not cleanup_selectors: return content
    try: soup = _BeautifulSoup( content, 'lxml' )
    except Exception: return content
    for selector in cleanup_selectors:
        for element in soup.select( selector ):
            element.decompose( )
    # lxml wraps fragments in <html><body>; unwrap to return a fragment.
    body = soup.body
    if body is not None: return body.decode_contents( )
    return soup.decode_contents( )

206 

207 

def _convert_to_markdown( html_content: str ) -> str:
    ''' Converts HTML content to markdown format using markdownify. '''
    # Imported lazily so the dependency is only loaded on this path.
    import markdownify
    converter = markdownify.markdownify
    return converter( html_content, heading_style = 'ATX' )

212 

213 

def _derive_documentation_url(
    base_url: __.typx.Any, uri: str, object_name: str
) -> __.typx.Any:
    ''' Derives documentation URL from base URL and object URI.

        Appends the inventory URI to the base path and selects a fragment:
        the URI's own anchor when present, else the object name.
    '''
    prefix = base_url.path
    if uri.endswith( '#$' ):
        # mkdocstrings emits '#$' as a placeholder anchor; substitute the
        # object name as the fragment.
        return base_url._replace(
            path = f"{prefix}/{uri[ : -2 ]}", fragment = object_name )
    if '#' in uri:
        page, anchor = uri.split( '#', 1 )
        return base_url._replace(
            path = f"{prefix}/{page}", fragment = anchor )
    return base_url._replace(
        path = f"{prefix}/{uri}", fragment = object_name )

229 

230 

def _extract_content_from_element(
    element: __.typx.Any,
    element_id: str,
    theme: __.Absential[ str ] = __.absent
) -> tuple[ str, str ]:
    ''' Extracts signature and description content from element.

        Returns a ``(signature, description)`` pair; the description has
        had cleanup selectors applied.
    '''
    # Default to the Material theme's selectors when none is specified;
    # unknown theme names fall back to the generic selector set.
    if __.is_absent( theme ): theme_name = 'material'
    else: theme_name = theme
    patterns = MATERIAL_THEME_PATTERNS.get( theme_name, _GENERIC_PATTERN )
    signature = _extract_signature( element, patterns )
    raw_description = _extract_description( element, patterns )
    selectors = __.typx.cast(
        __.cabc.Sequence[ str ], patterns[ 'cleanup_selectors' ] )
    return signature, _cleanup_content( raw_description, selectors )

245 

246 

def _extract_description(
    element: __.typx.Any,
    patterns: __.cabc.Mapping[ str, __.typx.Any ]
) -> str:
    ''' Extracts description content from element.

        Prefers the mkdocstrings '.doc-contents' container; otherwise joins
        whatever the theme's fallback description selectors match.
    '''
    container = _find_doc_contents_container( element )
    if container: return container.decode_contents( )
    fragments = _extract_using_fallback_selectors( element, patterns )
    if not fragments: return ''
    return '\n\n'.join( fragments )

257 

258 

async def _extract_object_documentation(
    auxdata: __.ApplicationGlobals,
    base_url: __.typx.Any,
    obj: __.InventoryObject,
    include_snippets: bool,
    theme: __.Absential[ str ] = __.absent
) -> __.ContentDocument | None:
    ''' Extracts documentation for a single object from MkDocs site.

        Returns None (rather than raising) when the page cannot be
        retrieved or parsed, so batch extraction can continue.
    '''
    doc_url = _derive_documentation_url( base_url, obj.uri, obj.name )
    try:
        html_content = await __.retrieve_url_as_text(
            auxdata.content_cache, doc_url )
    except Exception as exc:
        __.acquire_scribe( __name__ ).debug(
            "Failed to retrieve %s: %s", doc_url, exc )
        return None
    anchor = doc_url.fragment or str( obj.name )
    try:
        parsed_content = parse_mkdocs_html(
            html_content, anchor, str( doc_url ), theme = theme )
    except Exception: return None
    description = _convert_to_markdown( parsed_content[ 'description' ] )
    snippet_max_length = 200
    content_snippet = ''
    if include_snippets:
        # Truncate long descriptions with an ellipsis for the snippet.
        if len( description ) > snippet_max_length:
            content_snippet = description[ : snippet_max_length ] + '...'
        else: content_snippet = description
    theme_label = 'unknown' if __.is_absent( theme ) else theme
    return __.ContentDocument(
        inventory_object = obj,
        signature = parsed_content[ 'signature' ],
        description = description,
        content_snippet = content_snippet,
        documentation_url = doc_url.geturl( ),
        extraction_metadata = __.immut.Dictionary( {
            'theme': theme_label,
            'extraction_method': 'mkdocs_html_parsing',
            'relevance_score': 1.0,
            'match_reasons': [ 'direct extraction' ],
        } ) )

303 

304 

def _extract_signature(
    element: __.typx.Any,
    patterns: __.cabc.Mapping[ str, __.typx.Any ]
) -> str:
    ''' Extracts signature/heading content from element.

        Tries the theme's signature selectors in priority order; falls back
        to the element's full text when none matches.
    '''
    selectors = __.typx.cast(
        __.cabc.Sequence[ str ], patterns[ 'signature_selectors' ] )
    for selector in selectors:
        match = element.select_one( selector )
        if not match: continue
        return _clean_extracted_text( match.get_text( ) )
    return _clean_extracted_text( element.get_text( ) )

317 

318 

def _extract_using_fallback_selectors(
    element: __.typx.Any,
    patterns: __.cabc.Mapping[ str, __.typx.Any ]
) -> list[ str ]:
    ''' Extracts description using fallback selectors.

        Collects the HTML of every element matching the theme's description
        selectors, in selector-priority order, skipping admonition titles
        and duplicate fragments.
    '''
    descriptions: list[ str ] = [ ]
    seen: set[ str ] = set( )  # O(1) dedupe instead of list membership scan
    description_selectors = __.typx.cast(
        __.cabc.Sequence[ str ], patterns[ 'description_selectors' ] )
    for selector in description_selectors:
        for desc_elem in element.select( selector ):
            # Admonition titles are styling artifacts, not content.
            # Single attribute lookup; 'class' may be absent (None).
            classes = desc_elem.get( 'class' ) or [ ]
            if 'admonition-title' in classes: continue
            html_content = str( desc_elem )
            if not html_content or html_content in seen: continue
            seen.add( html_content )
            descriptions.append( html_content )
    return descriptions

338 

339 

def _find_doc_contents_container( element: __.typx.Any ) -> __.typx.Any | None:
    ''' Finds the doc-contents container for the element.

        For headings, walks following siblings looking for the
        'div.doc-contents' that mkdocstrings places after the heading;
        otherwise searches within the element itself.
    '''
    if element.name in ( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ):
        sibling = element.next_sibling
        # NOTE: truthiness test intentionally stops at empty/falsy nodes,
        # matching the original traversal behavior.
        while sibling:
            is_contents_div = (
                hasattr( sibling, 'get' ) and sibling.name == 'div'
                and 'doc-contents' in sibling.get( 'class', [ ] ) )
            if is_contents_div: return sibling
            sibling = sibling.next_sibling
    return element.select_one( '.doc-contents' )

351 

352 

def _find_target_element(
    container: __.typx.Any, element_id: str
) -> __.typx.Any:
    ''' Finds target element within main container using ID strategies.

        Tries progressively looser strategies and always returns something,
        falling back to the whole container.
    '''
    # Strategy 1: exact 'id' attribute match.
    by_id = container.find( id = element_id )
    if by_id: return by_id
    # Strategy 2: mkdocstrings 'data-toc-label' attribute match.
    by_label = container.find( attrs = { 'data-toc-label': element_id } )
    if by_label: return by_label
    # Strategy 3: heading whose text mentions the identifier.
    for heading in container.find_all(
        [ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ]
    ):
        if element_id in heading.get_text( ): return heading
    # Strategy 4: section whose class list mentions the identifier.
    for section in container.find_all( 'section' ):
        classes = section.get( 'class' )
        if classes and element_id in ' '.join( classes ): return section
    return container

370 

371 

def _find_main_content_container(
    soup: __.typx.Any, theme: __.Absential[ str ] = __.absent
) -> __.Absential[ __.typx.Any ]:
    ''' Finds main content container using theme-specific strategies.

        Returns the first matching container, or ``absent`` when no
        selector matches.
    '''
    # Default to Material; unknown themes use the generic selector set.
    if __.is_absent( theme ): theme_name = 'material'
    else: theme_name = theme
    patterns = MATERIAL_THEME_PATTERNS.get( theme_name, _GENERIC_PATTERN )
    selectors = __.typx.cast(
        __.cabc.Sequence[ str ], patterns[ 'main_content_selectors' ] )
    for selector in selectors:
        candidate = soup.select_one( selector )
        if candidate: return candidate
    return __.absent