Coverage for sources/librovore/structures/sphinx/extraction.py: 12%

115 statements  

coverage.py v7.10.6, created at 2025-09-06 02:25 +0000

# vim: set filetype=python fileencoding=utf-8:
# -*- coding: utf-8 -*-

#============================================================================#
#                                                                            #
# Licensed under the Apache License, Version 2.0 (the "License");           #
# you may not use this file except in compliance with the License.          #
# You may obtain a copy of the License at                                    #
#                                                                            #
#   http://www.apache.org/licenses/LICENSE-2.0                              #
#                                                                            #
# Unless required by applicable law or agreed to in writing, software       #
# distributed under the License is distributed on an "AS IS" BASIS,         #
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  #
# See the License for the specific language governing permissions and       #
# limitations under the License.                                            #
#                                                                            #
#============================================================================#


''' Documentation extraction and content retrieval. '''


from bs4 import BeautifulSoup as _BeautifulSoup

from . import __
from . import urls as _urls


_scribe = __.acquire_scribe( __name__ )


# Theme-specific content extraction patterns
THEME_EXTRACTION_PATTERNS: __.cabc.Mapping[
    str, __.cabc.Mapping[ str, __.typx.Any ]
] = __.immut.Dictionary( {
    'pydoctheme': __.immut.Dictionary( {
        'anchor_elements': [ 'dt', 'a' ],
        'content_strategies': __.immut.Dictionary( {
            'dt': __.immut.Dictionary( {
                'description_source': 'next_sibling',
                'description_element': 'dd',
            } ),
            'a': __.immut.Dictionary( {
                'description_source': 'parent_next_sibling',
                'description_element': 'dd',
            } ),
        } ),
        'cleanup_selectors': [ 'a.headerlink' ],
    } ),
    'furo': __.immut.Dictionary( {
        'anchor_elements': [ 'span', 'a', 'dt' ],
        'content_strategies': __.immut.Dictionary( {
            'span': __.immut.Dictionary( {
                'description_source': 'parent_next_element',
                'description_element': 'p',
                'fallback_container': 'section',
            } ),
            'a': __.immut.Dictionary( {
                'description_source': 'parent_next_element',
                'description_element': 'p',
            } ),
            'dt': __.immut.Dictionary( {
                'description_source': 'next_sibling',
                'description_element': 'dd',
            } ),
        } ),
        'cleanup_selectors': [ 'a.headerlink', '.highlight' ],
    } ),
    'sphinx_rtd_theme': __.immut.Dictionary( {
        'anchor_elements': [ 'dt', 'span', 'a' ],
        'content_strategies': __.immut.Dictionary( {
            'dt': __.immut.Dictionary( {
                'description_source': 'next_sibling',
                'description_element': 'dd',
            } ),
            'span': __.immut.Dictionary( {
                'description_source': 'parent_content',
                'description_element': 'p',
            } ),
        } ),
        'cleanup_selectors': [ 'a.headerlink' ],
    } ),
} )

# Generic fallback pattern for unknown themes
_GENERIC_PATTERN = __.immut.Dictionary( {
    'anchor_elements': [ 'dt', 'span', 'a', 'section', 'div' ],
    'content_strategies': __.immut.Dictionary( {
        'dt': __.immut.Dictionary( {
            'description_source': 'next_sibling',
            'description_element': 'dd',
        } ),
        'section': __.immut.Dictionary( {
            'description_source': 'first_paragraph',
            'description_element': 'p',
        } ),
        'span': __.immut.Dictionary( {
            'description_source': 'parent_next_element',
            'description_element': 'p',
        } ),
        'a': __.immut.Dictionary( {
            'description_source': 'parent_next_element',
            'description_element': 'p',
        } ),
    } ),
    'cleanup_selectors': [ 'a.headerlink' ],
} )
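
# Notes on the pattern DSL above (explanatory comment added for this listing,
# derived from the extraction code later in this module): each entry in
# 'content_strategies' keys on an anchor tag name and selects a lookup mode
# via 'description_source' (handled by _get_description_by_source_type:
# 'next_sibling', 'parent_next_sibling', 'parent_next_element',
# 'parent_content', or 'first_paragraph'), while 'description_element' names
# the tag whose HTML becomes the description.  'cleanup_selectors' lists CSS
# selectors intended for removal by _cleanup_content.  'anchor_elements'
# appears to enumerate the tags that may carry anchor ids; it is not consumed
# within this module.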

async def extract_contents(
    auxdata: __.ApplicationGlobals,
    source: str,
    objects: __.cabc.Sequence[ __.InventoryObject ], /, *,
    theme: __.Absential[ str ] = __.absent,
) -> list[ __.ContentDocument ]:
    ''' Extracts documentation content for specified objects. '''
    base_url = _urls.normalize_base_url( source )
    if not objects: return [ ]
    tasks = [
        _extract_object_documentation(
            auxdata, base_url, source, obj, theme )
        for obj in objects ]
    candidate_results = await __.asyncf.gather_async(
        *tasks, return_exceptions = True )
    results: list[ __.ContentDocument ] = [
        result.value for result in candidate_results
        if __.generics.is_value( result ) and result.value is not None ]
    return results
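
# Concurrency note on extract_contents above: the gather-and-filter step keeps
# only successful, non-None documents.  A rough plain-asyncio analogue of that
# step is sketched below (names here are illustrative; the project's
# __.asyncf.gather_async wraps outcomes in result objects rather than
# returning raw exceptions):
#
#     import asyncio
#
#     async def _gather_successes( coroutines ):
#         outcomes = await asyncio.gather(
#             *coroutines, return_exceptions = True )
#         return [
#             outcome for outcome in outcomes
#             if not isinstance( outcome, BaseException )
#             and outcome is not None ]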

def parse_documentation_html(
    content: str, element_id: str, url: str, *,
    theme: __.Absential[ str ] = __.absent
) -> __.cabc.Mapping[ str, str ]:
    ''' Parses HTML content to extract documentation sections. '''
    try: soup = _BeautifulSoup( content, 'lxml' )
    except Exception as exc:
        raise __.DocumentationParseFailure(
            element_id, exc ) from exc
    # Theme should be provided from detection metadata.
    # If absent, generic container detection is used as a fallback.
    container = _find_main_content_container( soup, theme )
    if __.is_absent( container ):
        raise __.DocumentationContentAbsence( element_id )
    element = container.find( id = element_id )
    if not element:
        raise __.DocumentationObjectAbsence( element_id, url )
    description = _extract_content_with_dsl(
        element, element_id, theme )
    return {
        'description': description,
        'object_name': element_id,
    }
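
# Usage sketch for parse_documentation_html (illustrative only; the HTML
# fragment, the object name 'pkg.func', and the URL below are invented for
# demonstration and rely on the generic fallback container detection):
#
#     html = (
#         '<article role="main"><dl>'
#         '<dt id="pkg.func">pkg.func()</dt>'
#         '<dd><p>Does something useful.</p></dd>'
#         '</dl></article>' )
#     sections = parse_documentation_html(
#         html, 'pkg.func', 'https://example.invalid/api.html' )
#     sections[ 'description' ]  # '<p>Does something useful.</p>'
#     sections[ 'object_name' ]  # 'pkg.func'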

def _cleanup_content(
    content: str,
    cleanup_selectors: __.cabc.Sequence[ str ]
) -> str:
    ''' Removes unwanted elements from content using CSS selectors. '''
    # TODO: Implement CSS selector-based cleanup
    return content
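
# Illustrative sketch only (not part of the original module): one possible way
# to address the TODO in _cleanup_content is to re-parse the HTML fragment and
# drop every node matched by the configured CSS selectors.  The helper name
# below is hypothetical.
def _cleanup_content_sketch(
    content: str,
    cleanup_selectors: __.cabc.Sequence[ str ]
) -> str:
    ''' Removes elements matching CSS selectors (illustrative sketch). '''
    # 'html.parser' avoids wrapping fragments in <html>/<body> elements.
    soup = _BeautifulSoup( content, 'html.parser' )
    for selector in cleanup_selectors:
        for node in soup.select( selector ):
            node.decompose( )  # remove the matched element and its children
    return str( soup )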

def _extract_content_with_dsl(
    element: __.typx.Any,
    element_id: str,
    theme: __.Absential[ str ] = __.absent
) -> str:
    ''' Extracts content using DSL pattern configuration. '''
    theme_name = theme if not __.is_absent( theme ) else None
    if theme_name is not None:
        pattern = THEME_EXTRACTION_PATTERNS.get( theme_name, _GENERIC_PATTERN )
    else: pattern = _GENERIC_PATTERN
    content_strategies = __.typx.cast(
        __.cabc.Mapping[ str, __.cabc.Mapping[ str, __.typx.Any ] ],
        pattern[ 'content_strategies' ] )
    strategy = content_strategies.get( element.name )
    if not strategy: return _generic_extraction( element )
    description = _extract_description_with_strategy( element, strategy )
    if 'cleanup_selectors' in pattern:
        cleanup_selectors = __.typx.cast(
            __.cabc.Sequence[ str ], pattern[ 'cleanup_selectors' ] )
        description = _cleanup_content( description, cleanup_selectors )
    return description


def _extract_description_with_strategy(
    element: __.typx.Any,
    strategy: __.cabc.Mapping[ str, __.typx.Any ]
) -> str:
    ''' Extracts description using DSL strategy. '''
    source_type = __.typx.cast( str, strategy[ 'description_source' ] )
    element_type = __.typx.cast(
        str, strategy.get( 'description_element', 'p' ) )
    return _get_description_by_source_type(
        element, source_type, element_type )


async def _extract_object_documentation(
    auxdata: __.ApplicationGlobals,
    base_url: __.typx.Any,
    location: str,
    obj: __.InventoryObject,
    theme: __.Absential[ str ] = __.absent,
) -> __.ContentDocument | None:
    ''' Extracts documentation for a single object. '''
    from . import conversion as _conversion
    doc_url = _urls.derive_documentation_url(
        base_url, obj.uri, obj.name )
    try:
        html_content = (
            await __.retrieve_url_as_text(
                auxdata.content_cache, doc_url ) )
    except Exception as exc:
        _scribe.debug( "Failed to retrieve %s: %s", doc_url, exc )
        return None
    anchor = doc_url.fragment or str( obj.name )
    try:
        parsed_content = parse_documentation_html(
            html_content, anchor, str( doc_url ), theme = theme )
    except Exception: return None
    description = _conversion.html_to_markdown(
        parsed_content[ 'description' ] )
    content_id = __.produce_content_id( location, obj.name )
    return __.ContentDocument(
        inventory_object = obj,
        content_id = content_id,
        description = description,
        documentation_url = doc_url.geturl( ),
        extraction_metadata = __.immut.Dictionary( {
            'theme': theme if not __.is_absent( theme ) else 'unknown',
            'extraction_method': 'sphinx_html_parsing',
            'relevance_score': 1.0,
            'match_reasons': [ 'direct extraction' ],
        } )
    )


def _find_main_content_container(
    soup: __.typx.Any, theme: __.Absential[ str ] = __.absent
) -> __.Absential[ __.typx.Any ]:
    ''' Finds the main content container using theme-specific strategies. '''
    if theme == 'furo':
        containers = [
            soup.find( 'article', { 'role': 'main' } ),
            soup.find( 'div', { 'id': 'furo-main-content' } ),
        ]
    elif theme == 'sphinx_rtd_theme':
        containers = [
            soup.find( 'div', { 'class': 'document' } ),
            soup.find( 'div', { 'class': 'body' } ),
            soup.find( 'div', { 'role': 'main' } ),
        ]
    elif theme == 'pydoctheme':  # Python docs
        containers = [
            soup.find( 'div', { 'class': 'body' } ),
            soup.find( 'div', { 'class': 'content' } ),
            soup.body,  # Python docs often use body directly
        ]
    elif theme == 'flask':  # Flask docs
        containers = [
            soup.find( 'div', { 'class': 'body' } ),
            soup.find( 'div', { 'class': 'content' } ),
            soup.body,
        ]
    elif theme == 'alabaster':
        containers = [
            soup.find( 'div', { 'class': 'body' } ),
            soup.find( 'div', { 'class': 'content' } ),
        ]
    else:  # Generic fallback for unknown themes
        containers = [
            soup.find( 'article', { 'role': 'main' } ),  # Furo theme
            soup.find( 'div', { 'class': 'body' } ),  # Basic theme
            soup.find( 'div', { 'class': 'content' } ),  # Nature theme
            soup.find( 'div', { 'class': 'main' } ),  # Generic main
            soup.find( 'main' ),  # HTML5 main element
            soup.find( 'div', { 'role': 'main' } ),  # Role-based
            soup.body,  # Fallback to body if nothing else works
        ]
    for container in containers:
        if container: return container
    return __.absent


def _generic_extraction( element: __.typx.Any ) -> str:
    ''' Generic fallback extraction for unknown element types. '''
    description = ''
    if element.parent:
        next_p = element.parent.find( 'p' )
        if next_p:
            description = str( next_p )
    return description


def _get_description_by_source_type(
    element: __.typx.Any,
    source_type: str,
    element_type: str
) -> str:
    ''' Gets description content based on source type. '''
    match source_type:
        case 'next_sibling':
            return _get_sibling_text( element, element_type )
        case 'parent_next_sibling':
            return _get_parent_sibling_text( element, element_type )
        case 'parent_next_element':
            return _get_parent_element_text( element, element_type )
        case 'parent_content':
            return _get_parent_content_text( element, element_type )
        case 'first_paragraph':
            return _get_first_paragraph_text( element )
        case _: return ''


def _get_first_paragraph_text( element: __.typx.Any ) -> str:
    ''' Gets HTML content from first paragraph within element. '''
    paragraph = element.find( 'p' )
    return str( paragraph ) if paragraph else ''


def _get_parent_content_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from content element within parent. '''
    if element.parent:
        content_elem = element.parent.find( element_type )
        return content_elem.decode_contents( ) if content_elem else ''
    return ''


def _get_parent_element_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from element within parent. '''
    if element.parent:
        next_elem = element.parent.find( element_type )
        return next_elem.decode_contents( ) if next_elem else ''
    return ''


def _get_parent_sibling_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from parent's next sibling element. '''
    if element.parent:
        sibling = element.parent.find_next_sibling( element_type )
        return sibling.decode_contents( ) if sibling else ''
    return ''


def _get_sibling_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from next sibling element. '''
    sibling = element.find_next_sibling( element_type )
    return sibling.decode_contents( ) if sibling else ''