Coverage for sources/librovore/structures/sphinx/extraction.py: 12%
114 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-02 00:02 +0000
1# vim: set filetype=python fileencoding=utf-8:
2# -*- coding: utf-8 -*-
4#============================================================================#
5# #
6# Licensed under the Apache License, Version 2.0 (the "License"); #
7# you may not use this file except in compliance with the License. #
8# You may obtain a copy of the License at #
9# #
10# http://www.apache.org/licenses/LICENSE-2.0 #
11# #
12# Unless required by applicable law or agreed to in writing, software #
13# distributed under the License is distributed on an "AS IS" BASIS, #
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
15# See the License for the specific language governing permissions and #
16# limitations under the License. #
17# #
18#============================================================================#
21''' Documentation extraction and content retrieval. '''
24from bs4 import BeautifulSoup as _BeautifulSoup
26from . import __
27from . import urls as _urls
30_scribe = __.acquire_scribe( __name__ )
# Theme-specific extraction DSL, keyed by Sphinx theme name. Each pattern has:
#   anchor_elements:     tag names which may carry an object's anchor id
#   content_strategies:  per-tag recipe; 'description_source' names the
#                        navigation from anchor to description and
#                        'description_element' the tag to extract
#   cleanup_selectors:   CSS selectors for boilerplate removal afterwards
THEME_EXTRACTION_PATTERNS: __.cabc.Mapping[
    str, __.cabc.Mapping[ str, __.typx.Any ]
] = __.immut.Dictionary( {
    'pydoctheme': __.immut.Dictionary( {
        'anchor_elements': [ 'dt', 'a' ],
        'content_strategies': __.immut.Dictionary( {
            'dt': __.immut.Dictionary( {
                'description_source': 'next_sibling',
                'description_element': 'dd',
            } ),
            'a': __.immut.Dictionary( {
                'description_source': 'parent_next_sibling',
                'description_element': 'dd',
            } ),
        } ),
        'cleanup_selectors': [ 'a.headerlink' ],
    } ),
    'furo': __.immut.Dictionary( {
        'anchor_elements': [ 'span', 'a', 'dt' ],
        'content_strategies': __.immut.Dictionary( {
            'span': __.immut.Dictionary( {
                'description_source': 'parent_next_element',
                'description_element': 'p',
                'fallback_container': 'section',
            } ),
            'a': __.immut.Dictionary( {
                'description_source': 'parent_next_element',
                'description_element': 'p',
            } ),
            'dt': __.immut.Dictionary( {
                'description_source': 'next_sibling',
                'description_element': 'dd',
            } ),
        } ),
        'cleanup_selectors': [ 'a.headerlink', '.highlight' ],
    } ),
    'sphinx_rtd_theme': __.immut.Dictionary( {
        'anchor_elements': [ 'dt', 'span', 'a' ],
        'content_strategies': __.immut.Dictionary( {
            'dt': __.immut.Dictionary( {
                'description_source': 'next_sibling',
                'description_element': 'dd',
            } ),
            'span': __.immut.Dictionary( {
                'description_source': 'parent_content',
                'description_element': 'p',
            } ),
        } ),
        'cleanup_selectors': [ 'a.headerlink' ],
    } ),
} )
# Fallback extraction DSL applied when the theme is unknown or unmapped.
# Same schema as entries in THEME_EXTRACTION_PATTERNS.
_GENERIC_PATTERN = __.immut.Dictionary( {
    'anchor_elements': [ 'dt', 'span', 'a', 'section', 'div' ],
    'content_strategies': __.immut.Dictionary( {
        'dt': __.immut.Dictionary( {
            'description_source': 'next_sibling',
            'description_element': 'dd',
        } ),
        'section': __.immut.Dictionary( {
            'description_source': 'first_paragraph',
            'description_element': 'p',
        } ),
        'span': __.immut.Dictionary( {
            'description_source': 'parent_next_element',
            'description_element': 'p',
        } ),
        'a': __.immut.Dictionary( {
            'description_source': 'parent_next_element',
            'description_element': 'p',
        } ),
    } ),
    'cleanup_selectors': [ 'a.headerlink' ],
} )
async def extract_contents(
    auxdata: __.ApplicationGlobals,
    source: str,
    objects: __.cabc.Sequence[ __.InventoryObject ], /, *,
    theme: __.Absential[ str ] = __.absent,
) -> list[ __.ContentDocument ]:
    ''' Extracts documentation content for specified objects.

        Objects are fetched concurrently; failures for individual objects
        are dropped rather than failing the batch.
    '''
    base_url = _urls.normalize_base_url( source )
    if not objects: return [ ]
    extraction_tasks = [
        _extract_object_documentation( auxdata, base_url, obj, theme )
        for obj in objects ]
    outcomes = await __.asyncf.gather_async(
        *extraction_tasks, return_exceptions = True )
    documents: list[ __.ContentDocument ] = [ ]
    for outcome in outcomes:
        # Skip failed extractions and objects without retrievable content.
        if not __.generics.is_value( outcome ): continue
        if outcome.value is None: continue
        documents.append( outcome.value )
    return documents
def parse_documentation_html(
    content: str, element_id: str, url: str, *,
    theme: __.Absential[ str ] = __.absent
) -> __.cabc.Mapping[ str, str ]:
    ''' Parses HTML content to extract documentation sections.

        Raises project-specific failures when the HTML cannot be parsed,
        when no main content container exists, or when the anchored
        element is missing from the page.
    '''
    try: soup = _BeautifulSoup( content, 'lxml' )
    except Exception as exc:
        raise __.DocumentationParseFailure( element_id, exc ) from exc
    # Theme comes from detection metadata; when absent, container lookup
    # falls back to generic heuristics.
    container = _find_main_content_container( soup, theme )
    if __.is_absent( container ):
        raise __.DocumentationContentAbsence( element_id )
    anchored = container.find( id = element_id )
    if not anchored:
        raise __.DocumentationObjectAbsence( element_id, url )
    description = _extract_content_with_dsl( anchored, element_id, theme )
    return {
        'description': description,
        'object_name': element_id,
    }
def _cleanup_content(
    content: str,
    cleanup_selectors: __.cabc.Sequence[ str ]
) -> str:
    ''' Removes unwanted elements from content using CSS selectors.

        Parses the HTML fragment, decomposes every node matching any of
        the supplied selectors, and reserializes. Best-effort: on parse
        failure, or when nothing matches, the original content is
        returned unchanged. Selectors originate from the internal
        extraction DSL, so they are assumed syntactically valid.
    '''
    if not content or not cleanup_selectors: return content
    try: fragment = _BeautifulSoup( content, 'lxml' )
    except Exception: return content  # keep original on parse failure
    removed = False
    for selector in cleanup_selectors:
        for node in fragment.select( selector ):
            node.decompose( )
            removed = True
    # Avoid reserialization churn when no selector matched anything.
    if not removed: return content
    # NOTE(review): lxml wraps fragments in <html><body>; emit only the
    # body's inner HTML so the fragment shape is preserved.
    body = fragment.body
    if body is not None: return body.decode_contents( )
    return str( fragment )
def _extract_content_with_dsl(
    element: __.typx.Any,
    element_id: str,
    theme: __.Absential[ str ] = __.absent
) -> str:
    ''' Extracts content using DSL pattern configuration.

        Selects the theme's pattern (falling back to the generic one),
        applies the per-tag strategy for the anchor element, then strips
        boilerplate via the pattern's cleanup selectors.
    '''
    if __.is_absent( theme ): pattern = _GENERIC_PATTERN
    else:
        pattern = THEME_EXTRACTION_PATTERNS.get( theme, _GENERIC_PATTERN )
    strategies = __.typx.cast(
        __.cabc.Mapping[ str, __.cabc.Mapping[ str, __.typx.Any ] ],
        pattern[ 'content_strategies' ] )
    strategy = strategies.get( element.name )
    # Unknown anchor tags get the generic parent-paragraph heuristic.
    if not strategy: return _generic_extraction( element )
    description = _extract_description_with_strategy( element, strategy )
    if 'cleanup_selectors' not in pattern: return description
    selectors = __.typx.cast(
        __.cabc.Sequence[ str ], pattern[ 'cleanup_selectors' ] )
    return _cleanup_content( description, selectors )
def _extract_description_with_strategy(
    element: __.typx.Any,
    strategy: __.cabc.Mapping[ str, __.typx.Any ]
) -> str:
    ''' Extracts description using DSL strategy.

        Reads the navigation mode and target tag (default ``p``) from the
        strategy mapping and dispatches accordingly.
    '''
    navigation = __.typx.cast( str, strategy[ 'description_source' ] )
    target_tag = __.typx.cast(
        str, strategy.get( 'description_element', 'p' ) )
    return _get_description_by_source_type( element, navigation, target_tag )
async def _extract_object_documentation(
    auxdata: __.ApplicationGlobals,
    base_url: __.typx.Any,
    obj: __.InventoryObject,
    theme: __.Absential[ str ] = __.absent,
) -> __.ContentDocument | None:
    ''' Extracts documentation for a single object.

        Returns ``None`` on retrieval or parse failure (best-effort;
        batch callers drop missing results).
    '''
    from . import conversion as _conversion
    doc_url = _urls.derive_documentation_url( base_url, obj.uri, obj.name )
    try:
        page_html = await __.retrieve_url_as_text(
            auxdata.content_cache, doc_url )
    except Exception as exc:
        _scribe.debug( "Failed to retrieve %s: %s", doc_url, exc )
        return None
    # Prefer the URL fragment as the anchor; fall back to the object name.
    anchor = doc_url.fragment or str( obj.name )
    try:
        parsed = parse_documentation_html(
            page_html, anchor, str( doc_url ), theme = theme )
    except Exception: return None
    theme_label = 'unknown' if __.is_absent( theme ) else theme
    description = _conversion.html_to_markdown( parsed[ 'description' ] )
    return __.ContentDocument(
        inventory_object = obj,
        description = description,
        documentation_url = doc_url.geturl( ),
        extraction_metadata = __.immut.Dictionary( {
            'theme': theme_label,
            'extraction_method': 'sphinx_html_parsing',
            'relevance_score': 1.0,
            'match_reasons': [ 'direct extraction' ],
        } )
    )
def _find_main_content_container(
    soup: __.typx.Any, theme: __.Absential[ str ] = __.absent
) -> __.Absential[ __.typx.Any ]:
    ''' Finds the main content container using theme-specific strategies.

        Tries a theme-specific list of (tag, attrs) lookups in order;
        a ``None`` entry means "use the document body". Unknown or
        absent themes use a generic candidate list.
    '''
    body_div = ( 'div', { 'class': 'body' } )
    content_div = ( 'div', { 'class': 'content' } )
    generic_specs = (
        ( 'article', { 'role': 'main' } ),  # Furo theme
        body_div,                           # Basic theme
        content_div,                        # Nature theme
        ( 'div', { 'class': 'main' } ),     # Generic main
        ( 'main', { } ),                    # HTML5 main element
        ( 'div', { 'role': 'main' } ),      # Role-based
        None,                               # Fallback to body
    )
    specs_by_theme: dict[ str, tuple ] = {
        'furo': (
            ( 'article', { 'role': 'main' } ),
            ( 'div', { 'id': 'furo-main-content' } ),
        ),
        'sphinx_rtd_theme': (
            ( 'div', { 'class': 'document' } ),
            body_div,
            ( 'div', { 'role': 'main' } ),
        ),
        # Python docs often use body directly.
        'pydoctheme': ( body_div, content_div, None ),
        'flask': ( body_div, content_div, None ),
        'alabaster': ( body_div, content_div ),
    }
    if __.is_absent( theme ): specs = generic_specs
    else: specs = specs_by_theme.get( theme, generic_specs )
    for spec in specs:
        candidate = soup.body if spec is None else soup.find( *spec )
        if candidate: return candidate
    return __.absent
def _generic_extraction( element: __.typx.Any ) -> str:
    ''' Generic fallback extraction for unknown element types.

        Serializes the first paragraph found under the element's parent,
        or returns an empty string when there is none.
    '''
    parent = element.parent
    if not parent: return ''
    paragraph = parent.find( 'p' )
    if not paragraph: return ''
    return str( paragraph )
def _get_description_by_source_type(
    element: __.typx.Any,
    source_type: str,
    element_type: str
) -> str:
    ''' Gets description content based on source type.

        Dispatches on the DSL navigation mode; unrecognized modes yield
        an empty string.
    '''
    if source_type == 'first_paragraph':
        return _get_first_paragraph_text( element )
    extractors = {
        'next_sibling': _get_sibling_text,
        'parent_next_sibling': _get_parent_sibling_text,
        'parent_next_element': _get_parent_element_text,
        'parent_content': _get_parent_content_text,
    }
    extractor = extractors.get( source_type )
    if extractor is None: return ''
    return extractor( element, element_type )
def _get_first_paragraph_text( element: __.typx.Any ) -> str:
    ''' Gets HTML content from first paragraph within element. '''
    paragraph = element.find( 'p' )
    if not paragraph: return ''
    return str( paragraph )
def _get_parent_content_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from content element within parent. '''
    parent = element.parent
    if not parent: return ''
    content_elem = parent.find( element_type )
    if not content_elem: return ''
    return content_elem.decode_contents( )
def _get_parent_element_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from element within parent. '''
    parent = element.parent
    if not parent: return ''
    target = parent.find( element_type )
    if not target: return ''
    return target.decode_contents( )
def _get_parent_sibling_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from parent's next sibling element. '''
    parent = element.parent
    if not parent: return ''
    sibling = parent.find_next_sibling( element_type )
    if not sibling: return ''
    return sibling.decode_contents( )
def _get_sibling_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from next sibling element. '''
    sibling = element.find_next_sibling( element_type )
    if not sibling: return ''
    return sibling.decode_contents( )