Coverage for sources/librovore/structures/sphinx/extraction.py: 11%

139 statements  

« prev     ^ index     » next       coverage.py v7.10.5, created at 2025-08-29 01:14 +0000

1# vim: set filetype=python fileencoding=utf-8: 

2# -*- coding: utf-8 -*- 

3 

4#============================================================================# 

5# # 

6# Licensed under the Apache License, Version 2.0 (the "License"); # 

7# you may not use this file except in compliance with the License. # 

8# You may obtain a copy of the License at # 

9# # 

10# http://www.apache.org/licenses/LICENSE-2.0 # 

11# # 

12# Unless required by applicable law or agreed to in writing, software # 

13# distributed under the License is distributed on an "AS IS" BASIS, # 

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 

15# See the License for the specific language governing permissions and # 

16# limitations under the License. # 

17# # 

18#============================================================================# 

19 

20 

21''' Documentation extraction and content retrieval. ''' 

22 

23 

24from bs4 import BeautifulSoup as _BeautifulSoup 

25 

26from . import __ 

27from . import urls as _urls 

28 

29 

# Module-level scribe (logger) for debug diagnostics during extraction.
_scribe = __.acquire_scribe( __name__ )

31 

32 

# Theme-specific content extraction patterns.
#
# Small DSL consumed by '_extract_content_with_dsl'. Keys per theme:
#   anchor_elements: tag names that may carry an object's anchor id.
#   content_strategies: per-tag plans; 'signature_source' is interpreted
#       by '_extract_signature_with_strategy', while 'description_source'
#       and 'description_element' are interpreted by
#       '_extract_description_with_strategy'.
#   cleanup_selectors: CSS selectors whose matches should be stripped
#       from extracted descriptions (see '_cleanup_content').
THEME_EXTRACTION_PATTERNS: __.cabc.Mapping[
    str, __.cabc.Mapping[ str, __.typx.Any ]
] = __.immut.Dictionary( {
    'pydoctheme': __.immut.Dictionary( {
        'anchor_elements': [ 'dt', 'a' ],
        'content_strategies': __.immut.Dictionary( {
            'dt': __.immut.Dictionary( {
                'signature_source': 'self',
                'description_source': 'next_sibling',
                'description_element': 'dd',
            } ),
            'a': __.immut.Dictionary( {
                'signature_source': 'parent_text',
                'description_source': 'parent_next_sibling',
                'description_element': 'dd',
            } ),
        } ),
        'cleanup_selectors': [ 'a.headerlink' ],
    } ),
    'furo': __.immut.Dictionary( {
        'anchor_elements': [ 'span', 'a', 'dt' ],
        'content_strategies': __.immut.Dictionary( {
            'span': __.immut.Dictionary( {
                'signature_source': 'parent_header',
                'description_source': 'parent_next_element',
                'description_element': 'p',
                # NOTE(review): 'fallback_container' is not read by any
                # strategy interpreter visible in this module — confirm.
                'fallback_container': 'section',
            } ),
            'a': __.immut.Dictionary( {
                'signature_source': 'parent_text',
                'description_source': 'parent_next_element',
                'description_element': 'p',
            } ),
            'dt': __.immut.Dictionary( {
                'signature_source': 'self',
                'description_source': 'next_sibling',
                'description_element': 'dd',
            } ),
        } ),
        'cleanup_selectors': [ 'a.headerlink', '.highlight' ],
    } ),
    'sphinx_rtd_theme': __.immut.Dictionary( {
        'anchor_elements': [ 'dt', 'span', 'a' ],
        'content_strategies': __.immut.Dictionary( {
            'dt': __.immut.Dictionary( {
                'signature_source': 'self',
                'description_source': 'next_sibling',
                'description_element': 'dd',
            } ),
            'span': __.immut.Dictionary( {
                'signature_source': 'parent_header',
                'description_source': 'parent_content',
                'description_element': 'p',
            } ),
        } ),
        'cleanup_selectors': [ 'a.headerlink' ],
    } ),
} )

92 

# Generic fallback pattern for unknown themes.
# Same DSL shape as THEME_EXTRACTION_PATTERNS entries; selected whenever
# the theme is absent or unrecognized (see '_extract_content_with_dsl').
_GENERIC_PATTERN = __.immut.Dictionary( {
    'anchor_elements': [ 'dt', 'span', 'a', 'section', 'div' ],
    'content_strategies': __.immut.Dictionary( {
        'dt': __.immut.Dictionary( {
            'signature_source': 'self',
            'description_source': 'next_sibling',
            'description_element': 'dd',
        } ),
        'section': __.immut.Dictionary( {
            'signature_source': 'first_header',
            'description_source': 'first_paragraph',
            'description_element': 'p',
        } ),
        'span': __.immut.Dictionary( {
            'signature_source': 'parent_header',
            'description_source': 'parent_next_element',
            'description_element': 'p',
        } ),
        'a': __.immut.Dictionary( {
            'signature_source': 'parent_text',
            'description_source': 'parent_next_element',
            'description_element': 'p',
        } ),
    } ),
    'cleanup_selectors': [ 'a.headerlink' ],
} )

120 

121 

async def extract_contents(
    auxdata: __.ApplicationGlobals,
    source: str,
    objects: __.cabc.Sequence[ __.InventoryObject ], /, *,
    theme: __.Absential[ str ] = __.absent,
    include_snippets: bool = True,
) -> list[ __.ContentDocument ]:
    ''' Extracts documentation content for specified objects.

        Fans out one extraction task per inventory object and collects
        the successful results; failed or empty extractions are dropped.
    '''
    base_url = _urls.normalize_base_url( source )
    if not objects: return [ ]
    extraction_tasks = [ ]
    for obj in objects:
        extraction_tasks.append(
            _extract_object_documentation(
                auxdata, base_url, obj, include_snippets, theme ) )
    outcomes = await __.asyncf.gather_async(
        *extraction_tasks, return_exceptions = True )
    documents: list[ __.ContentDocument ] = [ ]
    for outcome in outcomes:
        # Keep only successful outcomes which produced a document.
        if not __.generics.is_value( outcome ): continue
        if outcome.value is None: continue
        documents.append( outcome.value )
    return documents

142 

143 

def parse_documentation_html(
    content: str, element_id: str, url: str, *,
    theme: __.Absential[ str ] = __.absent
) -> __.cabc.Mapping[ str, str ]:
    ''' Parses HTML content to extract documentation sections.

        Raises parse, content-absence, or object-absence errors when the
        page cannot be parsed, lacks a main container, or lacks the
        anchored element.
    '''
    try:
        document = _BeautifulSoup( content, 'lxml' )
    except Exception as exc:
        raise __.DocumentationParseFailure( element_id, exc ) from exc
    # Theme comes from detection metadata; when absent, container lookup
    # falls back to generic heuristics.
    container = _find_main_content_container( document, theme )
    if __.is_absent( container ):
        raise __.DocumentationContentAbsence( element_id )
    target = container.find( id = element_id )
    if not target:
        raise __.DocumentationObjectAbsence( element_id, url )
    signature, description = _extract_content_with_dsl(
        target, element_id, theme )
    return {
        'signature': signature,
        'description': description,
        'object_name': element_id,
    }

168 

169 

def _cleanup_content(
    content: str,
    cleanup_selectors: __.cabc.Sequence[ str ]
) -> str:
    ''' Removes unwanted elements from content using CSS selectors.

        Parses the HTML fragment, removes every element matching any of
        the provided CSS selectors (e.g. ``a.headerlink`` permalink
        anchors), and returns the remaining markup. Best-effort: on any
        parse failure the content is returned unchanged.
    '''
    # Previously a TODO stub which returned content unchanged, leaving
    # the per-theme 'cleanup_selectors' configuration dead.
    if not content or not cleanup_selectors: return content
    try: fragment = _BeautifulSoup( content, 'lxml' )
    except Exception: return content  # never fail extraction over cleanup
    for selector in cleanup_selectors:
        for node in fragment.select( selector ):
            node.decompose( )
    # lxml wraps fragments in <html><body>; unwrap to return the fragment.
    body = fragment.find( 'body' )
    return body.decode_contents( ) if body else str( fragment )

177 

178 

def _extract_content_with_dsl(
    element: __.typx.Any,
    element_id: str,
    theme: __.Absential[ str ] = __.absent
) -> tuple[ str, str ]:
    ''' Extracts content using DSL pattern configuration.

        Returns a (signature, description) pair; falls back to generic
        extraction when no strategy matches the element's tag name.
    '''
    pattern = _GENERIC_PATTERN
    if not __.is_absent( theme ) and theme is not None:
        pattern = THEME_EXTRACTION_PATTERNS.get( theme, _GENERIC_PATTERN )
    strategies = __.typx.cast(
        __.cabc.Mapping[ str, __.cabc.Mapping[ str, __.typx.Any ] ],
        pattern[ 'content_strategies' ] )
    strategy = strategies.get( element.name )
    if not strategy: return _generic_extraction( element )
    signature = _extract_signature_with_strategy( element, strategy )
    description = _extract_description_with_strategy( element, strategy )
    if 'cleanup_selectors' in pattern:
        selectors = __.typx.cast(
            __.cabc.Sequence[ str ], pattern[ 'cleanup_selectors' ] )
        description = _cleanup_content( description, selectors )
    return signature, description

201 

202 

def _extract_description_with_strategy(
    element: __.typx.Any,
    strategy: __.cabc.Mapping[ str, __.typx.Any ]
) -> str:
    ''' Extracts description using DSL strategy.

        Resolves the strategy's source type and target element (which
        defaults to paragraphs), then delegates to the source dispatcher.
    '''
    source = __.typx.cast( str, strategy[ 'description_source' ] )
    target_tag = __.typx.cast(
        str, strategy.get( 'description_element', 'p' ) )
    return _get_description_by_source_type( element, source, target_tag )

213 

214 

async def _extract_object_documentation(
    auxdata: __.ApplicationGlobals,
    base_url: __.typx.Any,
    obj: __.InventoryObject,
    include_snippets: bool,
    theme: __.Absential[ str ] = __.absent
) -> __.ContentDocument | None:
    ''' Extracts documentation for a single object.

        Retrieves the object's documentation page, parses the anchored
        section, and converts the description to Markdown. Returns
        ``None`` on any retrieval or parse failure (best-effort; callers
        filter out ``None`` results).
    '''
    # Local import; presumably avoids a circular import — TODO confirm.
    from . import conversion as _conversion
    doc_url = _urls.derive_documentation_url(
        base_url, obj.uri, obj.name )
    try:
        html_content = (
            await __.retrieve_url_as_text(
                auxdata.content_cache, doc_url ) )
    except Exception as exc:
        # Retrieval failures are logged at debug level and swallowed.
        _scribe.debug( "Failed to retrieve %s: %s", doc_url, exc )
        return None
    # Anchor defaults to the object name when the URL has no fragment.
    anchor = doc_url.fragment or str( obj.name )
    try:
        parsed_content = parse_documentation_html(
            html_content, anchor, str( doc_url ), theme = theme )
    except Exception: return None
    description = _conversion.html_to_markdown(
        parsed_content[ 'description' ] )
    snippet_max_length = 200
    if include_snippets:
        # Truncate long descriptions, marking truncation with an ellipsis.
        content_snippet = (
            description[ : snippet_max_length ] + '...'
            if len( description ) > snippet_max_length
            else description )
    else: content_snippet = ''
    return __.ContentDocument(
        inventory_object = obj,
        signature = parsed_content[ 'signature' ],
        description = description,
        content_snippet = content_snippet,
        documentation_url = doc_url.geturl( ),
        extraction_metadata = __.immut.Dictionary( {
            'theme': theme if not __.is_absent( theme ) else 'unknown',
            'extraction_method': 'sphinx_html_parsing',
            'relevance_score': 1.0,
            'match_reasons': [ 'direct extraction' ],
        } )
    )

260 

261 

def _extract_signature_with_strategy(
    element: __.typx.Any,
    strategy: __.cabc.Mapping[ str, __.typx.Any ]
) -> str:
    ''' Extracts signature using DSL strategy.

        Interprets the strategy's 'signature_source' directive; unknown
        directives behave like 'self'.
    '''
    headers = [ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ]
    source_type = __.typx.cast( str, strategy[ 'signature_source' ] )
    if source_type == 'parent_text':
        parent = element.parent
        if not parent: return ''
        return _clean_extracted_text( parent.get_text( ) )
    if source_type == 'parent_header':
        parent = element.parent
        if not parent: return ''
        header = parent.find( headers )
        if not header: return ''
        return _clean_extracted_text( header.get_text( ) )
    if source_type == 'first_header':
        header = element.find( headers )
        if not header: return ''
        return _clean_extracted_text( header.get_text( ) )
    # 'self' and any unrecognized source use the element's own text.
    return _clean_extracted_text( element.get_text( ) )

289 

290 

def _find_main_content_container(
    soup: __.typx.Any, theme: __.Absential[ str ] = __.absent
) -> __.Absential[ __.typx.Any ]:
    ''' Finds the main content container using theme-specific strategies.

        Returns the first truthy candidate container for the detected
        theme, or ``absent`` when no candidate matches.
    '''
    if theme == 'furo':
        candidates = [
            soup.find( 'article', { 'role': 'main' } ),
            soup.find( 'div', { 'id': 'furo-main-content' } ),
        ]
    elif theme == 'sphinx_rtd_theme':
        candidates = [
            soup.find( 'div', { 'class': 'document' } ),
            soup.find( 'div', { 'class': 'body' } ),
            soup.find( 'div', { 'role': 'main' } ),
        ]
    elif theme in ( 'pydoctheme', 'flask' ):
        # Python docs and Flask docs share the same container layout;
        # both often place content directly under <body>.
        candidates = [
            soup.find( 'div', { 'class': 'body' } ),
            soup.find( 'div', { 'class': 'content' } ),
            soup.body,
        ]
    elif theme == 'alabaster':
        candidates = [
            soup.find( 'div', { 'class': 'body' } ),
            soup.find( 'div', { 'class': 'content' } ),
        ]
    else:  # generic fallback heuristics for unknown themes
        candidates = [
            soup.find( 'article', { 'role': 'main' } ),  # Furo theme
            soup.find( 'div', { 'class': 'body' } ),     # basic theme
            soup.find( 'div', { 'class': 'content' } ),  # Nature theme
            soup.find( 'div', { 'class': 'main' } ),     # generic main
            soup.find( 'main' ),                         # HTML5 main
            soup.find( 'div', { 'role': 'main' } ),      # role-based
            soup.body,                                   # last resort
        ]
    return next(
        ( candidate for candidate in candidates if candidate ), __.absent )

336 

337 

def _clean_extracted_text( text: str ) -> str:
    ''' Cleans extracted text while preserving internal spacing.

        Strips outer whitespace, collapses runs of spaces to one space,
        and collapses blank-line runs to a single paragraph break.
    '''
    stripped = text.strip( )
    single_spaced = __.re.sub( r' +', ' ', stripped )
    return __.re.sub( r'\n\s*\n', '\n\n', single_spaced )

346 

347 

def _generic_extraction( element: __.typx.Any ) -> tuple[ str, str ]:
    ''' Generic fallback extraction for unknown element types.

        Signature is the element's own cleaned text; description is the
        raw HTML of the first paragraph under the parent, when present.
    '''
    signature = _clean_extracted_text( element.get_text( ) )
    parent = element.parent
    if not parent: return signature, ''
    paragraph = parent.find( 'p' )
    return signature, ( str( paragraph ) if paragraph else '' )

357 

358 

def _get_description_by_source_type(
    element: __.typx.Any,
    source_type: str,
    element_type: str
) -> str:
    ''' Gets description content based on source type.

        Dispatches to the matching extraction helper; unknown source
        types yield an empty string.
    '''
    if source_type == 'first_paragraph':
        return _get_first_paragraph_text( element )
    extractors = {
        'next_sibling': _get_sibling_text,
        'parent_next_sibling': _get_parent_sibling_text,
        'parent_next_element': _get_parent_element_text,
        'parent_content': _get_parent_content_text,
    }
    extractor = extractors.get( source_type )
    if extractor is None: return ''
    return extractor( element, element_type )

377 

378 

def _get_first_paragraph_text( element: __.typx.Any ) -> str:
    ''' Gets HTML content from first paragraph within element. '''
    first_paragraph = element.find( 'p' )
    if not first_paragraph: return ''
    return str( first_paragraph )

383 

384 

def _get_parent_content_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from content element within parent. '''
    parent = element.parent
    if not parent: return ''
    content_element = parent.find( element_type )
    if not content_element: return ''
    return content_element.decode_contents( )

391 

392 

def _get_parent_element_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from element within parent. '''
    # NOTE(review): identical logic to _get_parent_content_text; the two
    # are kept separate because the DSL names them distinctly.
    if not element.parent: return ''
    found = element.parent.find( element_type )
    return found.decode_contents( ) if found else ''

399 

400 

def _get_parent_sibling_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from parent's next sibling element. '''
    parent = element.parent
    if not parent: return ''
    neighbor = parent.find_next_sibling( element_type )
    return '' if not neighbor else neighbor.decode_contents( )

407 

408 

def _get_sibling_text( element: __.typx.Any, element_type: str ) -> str:
    ''' Gets HTML content from next sibling element. '''
    neighbor = element.find_next_sibling( element_type )
    if not neighbor: return ''
    return neighbor.decode_contents( )