Coverage for sources/librovore/structures/mkdocs/extraction.py: 10%

130 statements  

coverage.py v7.10.4, created at 2025-08-20 22:48 +0000

# vim: set filetype=python fileencoding=utf-8:
# -*- coding: utf-8 -*-

#============================================================================#
#                                                                            #
# Licensed under the Apache License, Version 2.0 (the "License");            #
# you may not use this file except in compliance with the License.           #
# You may obtain a copy of the License at                                    #
#                                                                            #
#     http://www.apache.org/licenses/LICENSE-2.0                             #
#                                                                            #
# Unless required by applicable law or agreed to in writing, software        #
# distributed under the License is distributed on an "AS IS" BASIS,          #
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.   #
# See the License for the specific language governing permissions and        #
# limitations under the License.                                             #
#                                                                            #
#============================================================================#


''' MkDocs documentation content extraction and processing. '''


from bs4 import BeautifulSoup as _BeautifulSoup

from . import __


MATERIAL_THEME_PATTERNS: __.cabc.Mapping[
    str, __.cabc.Mapping[ str, __.typx.Any ]
] = __.immut.Dictionary( {
    'material': __.immut.Dictionary( {
        'main_content_selectors': [
            'article[role="main"]',
            '.md-content__inner',
            '.md-typeset',
            'main .md-content',
        ],
        'api_section_selectors': [
            '.doc.doc-object-member',
            '.doc.doc-children',
            'section[id]',
            '.highlight',
        ],
        'signature_selectors': [
            '.doc-heading',
            '.highlight .n',
            'h1, h2, h3, h4, h5, h6',
            'code',
        ],
        'description_selectors': [
            '.doc-contents',
            '.doc-object-member .doc-contents',
            'p',
            '.admonition',
        ],
        'cleanup_selectors': [
            '.md-nav',
            '.md-header',
            '.md-footer',
            '.md-sidebar',
            '.headerlink',
            '.md-clipboard',
            'a.md-top',
        ],
        'code_block_selectors': [
            '.highlight',
            'pre code',
            '.codehilite',
        ],
    } ),
    'readthedocs': __.immut.Dictionary( {
        'main_content_selectors': [
            '.wy-nav-content-wrap main',
            '.document',
            '[role="main"]',
        ],
        'api_section_selectors': [
            '.section',
            'dl.class',
            'dl.function',
            'dl.method',
        ],
        'signature_selectors': [
            'dt',
            '.descname',
            '.sig-name',
        ],
        'description_selectors': [
            'dd',
            '.field-body',
            'p',
        ],
        'cleanup_selectors': [
            '.headerlink',
            '.wy-nav-top',
            '.wy-nav-side',
        ],
        'code_block_selectors': [
            '.highlight',
            'pre',
        ],
    } ),
} )

_GENERIC_PATTERN = __.immut.Dictionary( {
    'main_content_selectors': [
        'article[role="main"]',
        'main',
        '.content',
        '.document',
        'body',
    ],
    'api_section_selectors': [
        'section[id]',
        'div[id]',
        '.doc-object-member',
        'dl',
    ],
    'signature_selectors': [
        'h1, h2, h3, h4, h5, h6',
        'dt',
        'code',
        '.highlight',
    ],
    'description_selectors': [
        'p',
        'dd',
        '.description',
        '.doc-contents',
    ],
    'cleanup_selectors': [
        '.headerlink',
        'nav',
        'header',
        'footer',
        '.sidebar',
    ],
    'code_block_selectors': [
        '.highlight',
        'pre',
        'code',
    ],
} )


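# Theme names not present in MATERIAL_THEME_PATTERNS fall back to
# _GENERIC_PATTERN; see the .get( theme_name, _GENERIC_PATTERN ) lookups in
# _extract_content_from_element and _find_main_content_container below.
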
async def extract_contents(
    auxdata: __.ApplicationGlobals,
    source: str,
    objects: __.cabc.Sequence[ __.cabc.Mapping[ str, __.typx.Any ] ], /, *,
    theme: __.Absential[ str ] = __.absent,
    include_snippets: bool = True,
) -> list[ dict[ str, __.typx.Any ] ]:
    ''' Extracts documentation content for specified objects from MkDocs. '''
    base_url = __.normalize_base_url( source )
    if not objects: return [ ]
    tasks = [
        _extract_object_documentation(
            auxdata, base_url, dict( obj ), include_snippets, theme )
        for obj in objects ]
    candidate_results = await __.asyncf.gather_async(
        *tasks, return_exceptions = True )
    results: list[ dict[ str, __.typx.Any ] ] = [
        dict( result.value ) for result in candidate_results
        if __.generics.is_value( result ) and result.value is not None ]
    return results


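# Illustrative invocation of extract_contents (hypothetical inventory record;
# real records are supplied by the MkDocs inventory layer):
#     objects = [ {
#         'name': 'pkg.module.func', 'uri': 'reference/pkg/module/#$',
#         'role': 'function', 'domain': 'py', 'priority': 1 } ]
#     results = await extract_contents(
#         auxdata, 'https://example.org/docs', objects, theme = 'material' )
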
def parse_mkdocs_html(
    content: str, element_id: str, url: str, *,
    theme: __.Absential[ str ] = __.absent
) -> __.cabc.Mapping[ str, str ]:
    ''' Parses MkDocs HTML content to extract documentation sections. '''
    try: soup = _BeautifulSoup( content, 'lxml' )
    except Exception as exc:
        raise __.DocumentationParseFailure( element_id, exc ) from exc
    main_container = _find_main_content_container( soup, theme )
    if __.is_absent( main_container ):
        raise __.DocumentationContentAbsence( element_id )
    target_element = _find_target_element( main_container, element_id )
    if not target_element:
        raise __.DocumentationObjectAbsence( element_id, url )
    signature, description = _extract_content_from_element(
        target_element, element_id, theme )
    return {
        'signature': signature,
        'description': description,
        'object_name': element_id,
    }


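# Illustrative call to parse_mkdocs_html (hypothetical values):
#     sections = parse_mkdocs_html(
#         html_text, 'pkg.module.func',
#         'https://example.org/docs/reference/pkg/module/', theme = 'material' )
#     # sections carries 'signature', 'description', and 'object_name' keys.
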
def _clean_extracted_text( text: str ) -> str:
    ''' Cleans extracted text while preserving meaningful structure. '''
    text = text.strip( )
    text = __.re.sub( r' +', ' ', text )
    return __.re.sub( r'\n\s*\n', '\n\n', text )


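# For example:
#     _clean_extracted_text( '  foo   bar\n\n\n\nbaz  ' )
#     # -> 'foo bar\n\nbaz'
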
def _cleanup_content(
    content: str,
    cleanup_selectors: __.cabc.Sequence[ str ]
) -> str:
    ''' Removes unwanted elements from content. '''
    # TODO: Implement more sophisticated cleanup
    return content


def _convert_to_markdown( html_content: str ) -> str:
    ''' Converts HTML content to markdown format using markdownify. '''
    import markdownify
    return markdownify.markdownify( html_content, heading_style = 'ATX' )


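# Approximate example of the markdownify conversion (exact whitespace may
# differ by markdownify version):
#     _convert_to_markdown( '<h2>Usage</h2><p>Call <code>func</code>.</p>' )
#     # -> '## Usage\n\nCall `func`.\n\n'
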
def _derive_documentation_url(
    base_url: __.typx.Any, uri: str, object_name: str
) -> __.typx.Any:
    ''' Derives documentation URL from base URL and object URI. '''
    if uri.endswith( '#$' ):
        # mkdocstrings pattern - replace #$ with object name anchor
        clean_uri = uri[ :-2 ]
        new_path = f"{base_url.path}/{clean_uri}"
        return base_url._replace( path = new_path, fragment = object_name )
    if '#' in uri:
        path_part, fragment = uri.split( '#', 1 )
        new_path = f"{base_url.path}/{path_part}"
        return base_url._replace( path = new_path, fragment = fragment )
    new_path = f"{base_url.path}/{uri}"
    return base_url._replace( path = new_path, fragment = object_name )


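# Illustrative derivations, assuming base_url.path == '/docs' (hypothetical):
#     uri 'reference/pkg/#$'  -> path '/docs/reference/pkg/', fragment = object name
#     uri 'page/#some-anchor' -> path '/docs/page/', fragment 'some-anchor'
#     uri 'page/'             -> path '/docs/page/', fragment = object name
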
def _extract_content_from_element(
    element: __.typx.Any,
    element_id: str,
    theme: __.Absential[ str ] = __.absent
) -> tuple[ str, str ]:
    ''' Extracts signature and description content from element. '''
    theme_name = theme if not __.is_absent( theme ) else 'material'
    patterns = MATERIAL_THEME_PATTERNS.get( theme_name, _GENERIC_PATTERN )
    signature = _extract_signature( element, patterns )
    description = _extract_description( element, patterns )
    cleanup_selectors = __.typx.cast(
        __.cabc.Sequence[ str ], patterns[ 'cleanup_selectors' ] )
    description = _cleanup_content( description, cleanup_selectors )
    return signature, description


def _extract_description(
    element: __.typx.Any,
    patterns: __.cabc.Mapping[ str, __.typx.Any ]
) -> str:
    ''' Extracts description content from element. '''
    doc_contents = _find_doc_contents_container( element )
    if doc_contents:
        return doc_contents.decode_contents( )
    descriptions = _extract_using_fallback_selectors( element, patterns )
    return '\n\n'.join( descriptions ) if descriptions else ''


async def _extract_object_documentation(
    auxdata: __.ApplicationGlobals,
    base_url: __.typx.Any,
    obj: dict[ str, __.typx.Any ],
    include_snippets: bool,
    theme: __.Absential[ str ] = __.absent
) -> dict[ str, __.typx.Any ] | None:
    ''' Extracts documentation for a single object from MkDocs site. '''
    doc_url = _derive_documentation_url(
        base_url, obj[ 'uri' ], obj[ 'name' ] )
    try:
        html_content = (
            await __.retrieve_url_as_text(
                auxdata.content_cache, doc_url ) )
    except Exception as exc:
        __.acquire_scribe( __name__ ).debug(
            "Failed to retrieve %s: %s", doc_url, exc )
        return None
    anchor = doc_url.fragment or str( obj[ 'name' ] )
    try:
        parsed_content = parse_mkdocs_html(
            html_content, anchor, str( doc_url ), theme = theme )
    except Exception: return None
    description = _convert_to_markdown( parsed_content[ 'description' ] )
    snippet_max_length = 200
    if include_snippets:
        content_snippet = (
            description[ : snippet_max_length ] + '...'
            if len( description ) > snippet_max_length
            else description )
    else: content_snippet = ''
    return {
        'object_name': obj[ 'name' ],
        'object_type': obj[ 'role' ],
        'domain': obj[ 'domain' ],
        'priority': obj[ 'priority' ],
        'url': doc_url.geturl( ),
        'signature': parsed_content[ 'signature' ],
        'description': description,
        'content_snippet': content_snippet,
        'relevance_score': 1.0,
        'match_reasons': [ 'direct extraction' ],
    }


def _extract_paragraphs_from_doc_contents(
    doc_contents: __.typx.Any
) -> list[ str ]:
    ''' Legacy function - now unused after markdownify migration. '''
    # This function is kept for backward compatibility but is no longer used
    # since we now extract the full doc-contents HTML in _extract_description
    descriptions: list[ str ] = [ ]
    for child in doc_contents.children:
        if hasattr( child, 'name' ):
            if (
                child.name == 'div' and
                'admonition' in child.get( 'class', [ ] )
            ): continue
            if child.name == 'p':
                html_content = str( child )
                if html_content and html_content not in descriptions:
                    descriptions.append( html_content )
    return descriptions


def _extract_signature(
    element: __.typx.Any,
    patterns: __.cabc.Mapping[ str, __.typx.Any ]
) -> str:
    ''' Extracts signature/heading content from element. '''
    signature_selectors = __.typx.cast(
        __.cabc.Sequence[ str ], patterns[ 'signature_selectors' ] )
    for selector in signature_selectors:
        signature_elem = element.select_one( selector )
        if signature_elem:
            return _clean_extracted_text( signature_elem.get_text( ) )
    return _clean_extracted_text( element.get_text( ) )


def _extract_using_fallback_selectors(
    element: __.typx.Any,
    patterns: __.cabc.Mapping[ str, __.typx.Any ]
) -> list[ str ]:
    ''' Extracts description using fallback selectors. '''
    descriptions: list[ str ] = [ ]
    description_selectors = __.typx.cast(
        __.cabc.Sequence[ str ], patterns[ 'description_selectors' ] )
    for selector in description_selectors:
        desc_elements = element.select( selector )
        for desc_elem in desc_elements:
            if (
                desc_elem.get( 'class' ) and
                'admonition-title' in desc_elem.get( 'class', [ ] )
            ): continue
            html_content = str( desc_elem )
            if html_content and html_content not in descriptions:
                descriptions.append( html_content )
    return descriptions


def _find_doc_contents_container( element: __.typx.Any ) -> __.typx.Any | None:
    ''' Finds the doc-contents container for the element. '''
    if element.name in ( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ):
        sibling = element.next_sibling
        while sibling:
            if (
                hasattr( sibling, 'get' ) and sibling.name == 'div' and
                'doc-contents' in sibling.get( 'class', [ ] )
            ): return sibling
            sibling = sibling.next_sibling
    return element.select_one( '.doc-contents' )


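# Typical mkdocstrings/Material markup targeted by the doc-contents lookup
# (illustrative, simplified):
#     <h2 id="pkg.module.func" class="doc doc-heading">func</h2>
#     <div class="doc doc-contents"><p>Function description ...</p></div>
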
def _find_target_element(
    container: __.typx.Any, element_id: str
) -> __.typx.Any:
    ''' Finds target element within main container using ID strategies. '''
    target = container.find( id = element_id )
    if target: return target
    target = container.find( attrs = { 'data-toc-label': element_id } )
    if target: return target
    for heading in container.find_all(
        [ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ] ):
        if element_id in heading.get_text( ):
            return heading
    for section in container.find_all( 'section' ):
        class_attr = section.get( 'class' )
        if class_attr and element_id in ' '.join( class_attr ):
            return section
    return container


def _find_main_content_container(
    soup: __.typx.Any, theme: __.Absential[ str ] = __.absent
) -> __.Absential[ __.typx.Any ]:
    ''' Finds main content container using theme-specific strategies. '''
    theme_name = theme if not __.is_absent( theme ) else 'material'
    patterns = MATERIAL_THEME_PATTERNS.get( theme_name, _GENERIC_PATTERN )
    main_selectors = __.typx.cast(
        __.cabc.Sequence[ str ], patterns[ 'main_content_selectors' ] )
    for selector in main_selectors:
        container = soup.select_one( selector )
        if container: return container
    return __.absent

402 return __.absent