Coverage for sources/librovore/structures/mkdocs/extraction.py: 11%

# vim: set filetype=python fileencoding=utf-8:
# -*- coding: utf-8 -*-

#============================================================================#
#                                                                            #
#  Licensed under the Apache License, Version 2.0 (the "License");           #
#  you may not use this file except in compliance with the License.          #
#  You may obtain a copy of the License at                                   #
#                                                                            #
#      http://www.apache.org/licenses/LICENSE-2.0                            #
#                                                                            #
#  Unless required by applicable law or agreed to in writing, software       #
#  distributed under the License is distributed on an "AS IS" BASIS,         #
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  #
#  See the License for the specific language governing permissions and       #
#  limitations under the License.                                            #
#                                                                            #
#============================================================================#


''' MkDocs documentation content extraction and processing. '''


from bs4 import BeautifulSoup as _BeautifulSoup

from . import __


MATERIAL_THEME_PATTERNS: __.cabc.Mapping[
    str, __.cabc.Mapping[ str, __.typx.Any ]
] = __.immut.Dictionary( {
    'material': __.immut.Dictionary( {
        'main_content_selectors': [
            'article[role="main"]',
            '.md-content__inner',
            '.md-typeset',
            'main .md-content',
        ],
        'api_section_selectors': [
            '.doc.doc-object-member',
            '.doc.doc-children',
            'section[id]',
            '.highlight',
        ],
        'description_selectors': [
            '.doc-contents',
            '.doc-object-member .doc-contents',
            'p',
            '.admonition',
        ],
        'cleanup_selectors': [
            '.md-nav',
            '.md-header',
            '.md-footer',
            '.md-sidebar',
            '.headerlink',
            '.md-clipboard',
            'a.md-top',
        ],
        'code_block_selectors': [
            '.highlight',
            'pre code',
            '.codehilite',
        ],
    } ),
    'readthedocs': __.immut.Dictionary( {
        'main_content_selectors': [
            '.wy-nav-content-wrap main',
            '.document',
            '[role="main"]',
        ],
        'api_section_selectors': [
            '.section',
            'dl.class',
            'dl.function',
            'dl.method',
        ],
        'description_selectors': [
            'dd',
            '.field-body',
            'p',
        ],
        'cleanup_selectors': [
            '.headerlink',
            '.wy-nav-top',
            '.wy-nav-side',
        ],
        'code_block_selectors': [
            '.highlight',
            'pre',
        ],
    } ),
} )

_GENERIC_PATTERN = __.immut.Dictionary( {
    'main_content_selectors': [
        'article[role="main"]',
        'main',
        '.content',
        '.document',
        'body',
    ],
    'api_section_selectors': [
        'section[id]',
        'div[id]',
        '.doc-object-member',
        'dl',
    ],
    'description_selectors': [
        'p',
        'dd',
        '.description',
        '.doc-contents',
    ],
    'cleanup_selectors': [
        '.headerlink',
        'nav',
        'header',
        'footer',
        '.sidebar',
    ],
    'code_block_selectors': [
        '.highlight',
        'pre',
        'code',
    ],
} )
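

# Selector patterns are looked up by theme name, with _GENERIC_PATTERN as
# the fallback for unrecognized themes. A minimal illustration (the theme
# string here is a hypothetical example):
#
#     patterns = MATERIAL_THEME_PATTERNS.get( 'mkdocs', _GENERIC_PATTERN )
#     patterns[ 'main_content_selectors' ][ 0 ]  # 'article[role="main"]'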


async def extract_contents(
    auxdata: __.ApplicationGlobals,
    source: str,
    objects: __.cabc.Sequence[ __.InventoryObject ], /, *,
    theme: __.Absential[ str ] = __.absent,
) -> list[ __.ContentDocument ]:
    ''' Extracts documentation content for specified objects from MkDocs. '''
    base_url = __.normalize_base_url( source )
    if not objects: return [ ]
    tasks = [
        _extract_object_documentation(
            auxdata, base_url, source, obj, theme )
        for obj in objects ]
    candidate_results = await __.asyncf.gather_async(
        *tasks, return_exceptions = True )
    results: list[ __.ContentDocument ] = [
        result.value for result in candidate_results
        if __.generics.is_value( result ) and result.value is not None ]
    return results
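

# Hypothetical call site for the coroutine above; the URL and inventory
# objects are placeholders, and `auxdata` comes from application startup:
#
#     documents = await extract_contents(
#         auxdata, 'https://example.org/docs/', inventory_objects,
#         theme = 'material' )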


def parse_mkdocs_html(
    content: str, element_id: str, url: str, *,
    theme: __.Absential[ str ] = __.absent
) -> __.cabc.Mapping[ str, str ]:
    ''' Parses MkDocs HTML content to extract documentation sections. '''
    try: soup = _BeautifulSoup( content, 'lxml' )
    except Exception as exc:
        raise __.DocumentationParseFailure( element_id, exc ) from exc
    main_container = _find_main_content_container( soup, theme )
    if __.is_absent( main_container ):
        raise __.DocumentationContentAbsence( element_id )
    target_element = _find_target_element( main_container, element_id )
    if not target_element:
        raise __.DocumentationObjectAbsence( element_id, url )
    description = _extract_content_from_element(
        target_element, element_id, theme )
    return {
        'description': description,
        'object_name': element_id,
    }
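

# The mapping returned above always carries exactly these two keys; an
# illustrative result (the values are hypothetical):
#
#     { 'description': '<p>Frobnicates widgets.</p>',
#       'object_name': 'pkg.frobnicate' }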


def _cleanup_content(
    content: str,
    cleanup_selectors: __.cabc.Sequence[ str ]
) -> str:
    ''' Removes unwanted elements from content. '''
    # TODO: Implement more sophisticated cleanup
    return content
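

# One possible shape for the TODO above (a sketch, not the final design):
# reparse the fragment and drop any nodes matching the cleanup selectors.
#
#     soup = _BeautifulSoup( content, 'lxml' )
#     for selector in cleanup_selectors:
#         for node in soup.select( selector ): node.decompose( )
#     return str( soup )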


def _convert_to_markdown( html_content: str ) -> str:
    ''' Converts HTML content to markdown format using markdownify. '''
    import markdownify
    return markdownify.markdownify( html_content, heading_style = 'ATX' )
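

# Roughly what markdownify produces with heading_style = 'ATX' (exact
# whitespace may differ by version):
#
#     _convert_to_markdown( '<h2>Usage</h2><p>Call <code>f</code>.</p>' )
#     # -> '## Usage\n\nCall `f`.'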


def _derive_documentation_url(
    base_url: __.typx.Any, uri: str, object_name: str
) -> __.typx.Any:
    ''' Derives documentation URL from base URL and object URI. '''
    if uri.endswith( '#$' ):
        # mkdocstrings pattern - replace #$ with object name anchor
        clean_uri = uri[ :-2 ]
        new_path = f"{base_url.path}/{clean_uri}"
        return base_url._replace( path = new_path, fragment = object_name )
    if '#' in uri:
        path_part, fragment = uri.split( '#', 1 )
        new_path = f"{base_url.path}/{path_part}"
        return base_url._replace( path = new_path, fragment = fragment )
    new_path = f"{base_url.path}/{uri}"
    return base_url._replace( path = new_path, fragment = object_name )
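

# Worked examples for the three branches above, assuming base_url.path is
# '/docs' (URIs and object names are hypothetical):
#
#     uri 'api/module/#$', object 'pkg.Klass'
#         -> path '/docs/api/module/', fragment 'pkg.Klass'
#     uri 'page/#section', object 'pkg.Klass'
#         -> path '/docs/page/', fragment 'section'
#     uri 'page/', object 'pkg.Klass'
#         -> path '/docs/page/', fragment 'pkg.Klass'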


def _extract_content_from_element(
    element: __.typx.Any,
    element_id: str,
    theme: __.Absential[ str ] = __.absent
) -> str:
    ''' Extracts description content from element and applies cleanup. '''
    theme_name = theme if not __.is_absent( theme ) else 'material'
    patterns = MATERIAL_THEME_PATTERNS.get( theme_name, _GENERIC_PATTERN )
    description = _extract_description( element, patterns )
    cleanup_selectors = __.typx.cast(
        __.cabc.Sequence[ str ], patterns[ 'cleanup_selectors' ] )
    return _cleanup_content( description, cleanup_selectors )


def _extract_description(
    element: __.typx.Any,
    patterns: __.cabc.Mapping[ str, __.typx.Any ]
) -> str:
    ''' Extracts description content from element. '''
    doc_contents = _find_doc_contents_container( element )
    if doc_contents:
        return doc_contents.decode_contents( )
    descriptions = _extract_using_fallback_selectors( element, patterns )
    return '\n\n'.join( descriptions ) if descriptions else ''


async def _extract_object_documentation(
    auxdata: __.ApplicationGlobals,
    base_url: __.typx.Any,
    location: str,
    obj: __.InventoryObject,
    theme: __.Absential[ str ] = __.absent,
) -> __.ContentDocument | None:
    ''' Extracts documentation for a single object from MkDocs site. '''
    doc_url = _derive_documentation_url(
        base_url, obj.uri, obj.name )
    try:
        html_content = (
            await __.retrieve_url_as_text(
                auxdata.content_cache, doc_url ) )
    except Exception as exc:
        __.acquire_scribe( __name__ ).debug(
            "Failed to retrieve %s: %s", doc_url, exc )
        return None
    anchor = doc_url.fragment or str( obj.name )
    try:
        parsed_content = parse_mkdocs_html(
            html_content, anchor, str( doc_url ), theme = theme )
    except Exception: return None
    description = _convert_to_markdown( parsed_content[ 'description' ] )
    content_id = __.produce_content_id( location, obj.name )
    return __.ContentDocument(
        inventory_object = obj,
        content_id = content_id,
        description = description,
        documentation_url = doc_url.geturl( ),
        extraction_metadata = __.immut.Dictionary( {
            'theme': theme if not __.is_absent( theme ) else 'unknown',
            'extraction_method': 'mkdocs_html_parsing',
            'relevance_score': 1.0,
            'match_reasons': [ 'direct extraction' ],
        } )
    )
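

# The helper above is a four-step pipeline: derive the page URL, fetch it
# through the content cache, parse the anchored section out of the HTML,
# then convert the extracted fragment to markdown. Retrieval and parsing
# failures degrade to None so that one bad page cannot abort the whole
# batch gathered by extract_contents.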


def _extract_using_fallback_selectors(
    element: __.typx.Any,
    patterns: __.cabc.Mapping[ str, __.typx.Any ]
) -> list[ str ]:
    ''' Extracts description using fallback selectors. '''
    descriptions: list[ str ] = [ ]
    description_selectors = __.typx.cast(
        __.cabc.Sequence[ str ], patterns[ 'description_selectors' ] )
    for selector in description_selectors:
        desc_elements = element.select( selector )
        for desc_elem in desc_elements:
            if (
                desc_elem.get( 'class' ) and
                'admonition-title' in desc_elem.get( 'class', [ ] )
            ): continue
            html_content = str( desc_elem )
            if html_content and html_content not in descriptions:
                descriptions.append( html_content )
    return descriptions


def _find_doc_contents_container( element: __.typx.Any ) -> __.typx.Any | None:
    ''' Finds the doc-contents container for the element. '''
    if element.name in ( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ):
        sibling = element.next_sibling
        while sibling:
            if (
                hasattr( sibling, 'get' ) and sibling.name == 'div' and
                'doc-contents' in sibling.get( 'class', [ ] )
            ): return sibling
            sibling = sibling.next_sibling
    return element.select_one( '.doc-contents' )
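

# The sibling walk above targets mkdocstrings-style markup, where the
# description block immediately follows its heading; a minimal
# hypothetical shape:
#
#     <h2 id="pkg.Klass">Klass</h2>
#     <div class="doc doc-contents">...</div>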


def _find_target_element(
    container: __.typx.Any, element_id: str
) -> __.typx.Any:
    ''' Finds target element within main container using ID strategies. '''
    target = container.find( id = element_id )
    if target: return target
    target = container.find( attrs = { 'data-toc-label': element_id } )
    if target: return target
    for heading in container.find_all(
            [ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ] ):
        if element_id in heading.get_text( ):
            return heading
    for section in container.find_all( 'section' ):
        class_attr = section.get( 'class' )
        if class_attr and element_id in ' '.join( class_attr ):
            return section
    return container


def _find_main_content_container(
    soup: __.typx.Any, theme: __.Absential[ str ] = __.absent
) -> __.Absential[ __.typx.Any ]:
    ''' Finds main content container using theme-specific strategies. '''
    theme_name = theme if not __.is_absent( theme ) else 'material'
    patterns = MATERIAL_THEME_PATTERNS.get( theme_name, _GENERIC_PATTERN )
    main_selectors = __.typx.cast(
        __.cabc.Sequence[ str ], patterns[ 'main_content_selectors' ] )
    for selector in main_selectors:
        container = soup.select_one( selector )
        if container: return container
    return __.absent
338 return __.absent