Coverage for sources/librovore/structures/sphinx/extraction.py: 11%

139 statements  

« prev     ^ index     » next       coverage.py v7.10.4, created at 2025-08-17 23:43 +0000

1# vim: set filetype=python fileencoding=utf-8: 

2# -*- coding: utf-8 -*- 

3 

4#============================================================================# 

5# # 

6# Licensed under the Apache License, Version 2.0 (the "License"); # 

7# you may not use this file except in compliance with the License. # 

8# You may obtain a copy of the License at # 

9# # 

10# http://www.apache.org/licenses/LICENSE-2.0 # 

11# # 

12# Unless required by applicable law or agreed to in writing, software # 

13# distributed under the License is distributed on an "AS IS" BASIS, # 

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 

15# See the License for the specific language governing permissions and # 

16# limitations under the License. # 

17# # 

18#============================================================================# 

19 

20 

21''' Documentation extraction and content retrieval. ''' 

22 

23 

24from bs4 import BeautifulSoup as _BeautifulSoup 

25 

26from . import __ 

27from . import urls as _urls 

28 

29 

30_scribe = __.acquire_scribe( __name__ ) 

31 

32 

# Theme-specific content extraction patterns.
#
# DSL keys understood by the extraction helpers in this module:
#   anchor_elements     -- tag names that may carry an object's anchor id.
#   content_strategies  -- per-tag recipe mapping the anchor tag name to:
#       signature_source    -- where to read the signature text
#                              ('self', 'parent_text', 'parent_header', ...).
#       description_source  -- where to read the description HTML
#                              ('next_sibling', 'parent_next_element', ...).
#       description_element -- tag name to look for at that location.
#       fallback_container  -- NOTE(review): declared for furo 'span' but not
#                              consumed by any visible strategy code; confirm.
#   cleanup_selectors   -- CSS selectors for elements intended to be stripped
#                          from extracted descriptions (e.g. headerlinks).
THEME_EXTRACTION_PATTERNS: __.cabc.Mapping[
    str, __.cabc.Mapping[ str, __.typx.Any ]
] = __.immut.Dictionary( {
    'pydoctheme': __.immut.Dictionary( {
        'anchor_elements': [ 'dt', 'a' ],
        'content_strategies': __.immut.Dictionary( {
            'dt': __.immut.Dictionary( {
                'signature_source': 'self',
                'description_source': 'next_sibling',
                'description_element': 'dd',
            } ),
            'a': __.immut.Dictionary( {
                'signature_source': 'parent_text',
                'description_source': 'parent_next_sibling',
                'description_element': 'dd',
            } ),
        } ),
        'cleanup_selectors': [ 'a.headerlink' ],
    } ),
    'furo': __.immut.Dictionary( {
        'anchor_elements': [ 'span', 'a', 'dt' ],
        'content_strategies': __.immut.Dictionary( {
            'span': __.immut.Dictionary( {
                'signature_source': 'parent_header',
                'description_source': 'parent_next_element',
                'description_element': 'p',
                'fallback_container': 'section',
            } ),
            'a': __.immut.Dictionary( {
                'signature_source': 'parent_text',
                'description_source': 'parent_next_element',
                'description_element': 'p',
            } ),
            'dt': __.immut.Dictionary( {
                'signature_source': 'self',
                'description_source': 'next_sibling',
                'description_element': 'dd',
            } ),
        } ),
        'cleanup_selectors': [ 'a.headerlink', '.highlight' ],
    } ),
    'sphinx_rtd_theme': __.immut.Dictionary( {
        'anchor_elements': [ 'dt', 'span', 'a' ],
        'content_strategies': __.immut.Dictionary( {
            'dt': __.immut.Dictionary( {
                'signature_source': 'self',
                'description_source': 'next_sibling',
                'description_element': 'dd',
            } ),
            'span': __.immut.Dictionary( {
                'signature_source': 'parent_header',
                'description_source': 'parent_content',
                'description_element': 'p',
            } ),
        } ),
        'cleanup_selectors': [ 'a.headerlink' ],
    } ),
} )

92 

# Generic fallback pattern for unknown themes.
# Same DSL shape as the theme-specific patterns: tries the widest set of
# anchor element types and the safest extraction strategies for each.
_GENERIC_PATTERN = __.immut.Dictionary( {
    'anchor_elements': [ 'dt', 'span', 'a', 'section', 'div' ],
    'content_strategies': __.immut.Dictionary( {
        'dt': __.immut.Dictionary( {
            'signature_source': 'self',
            'description_source': 'next_sibling',
            'description_element': 'dd',
        } ),
        'section': __.immut.Dictionary( {
            'signature_source': 'first_header',
            'description_source': 'first_paragraph',
            'description_element': 'p',
        } ),
        'span': __.immut.Dictionary( {
            'signature_source': 'parent_header',
            'description_source': 'parent_next_element',
            'description_element': 'p',
        } ),
        'a': __.immut.Dictionary( {
            'signature_source': 'parent_text',
            'description_source': 'parent_next_element',
            'description_element': 'p',
        } ),
    } ),
    'cleanup_selectors': [ 'a.headerlink' ],
} )

120 

121 

async def extract_contents(
    auxdata: __.ApplicationGlobals,
    source: str,
    objects: __.cabc.Sequence[ __.cabc.Mapping[ str, __.typx.Any ] ], /, *,
    theme: __.Absential[ str ] = __.absent,
    include_snippets: bool = True,
) -> list[ dict[ str, __.typx.Any ] ]:
    ''' Extracts documentation content for specified objects.

        Objects which fail retrieval or parsing are silently omitted
        from the returned list.
    '''
    base_url = _urls.normalize_base_url( source )
    if not objects: return [ ]
    extraction_tasks = [ ]
    for obj in objects:
        extraction_tasks.append(
            _extract_object_documentation(
                auxdata, base_url, dict( obj ), include_snippets, theme ) )
    outcomes = await __.asyncf.gather_async(
        *extraction_tasks, return_exceptions = True )
    extracted: list[ dict[ str, __.typx.Any ] ] = [ ]
    for outcome in outcomes:
        # Skip failed extractions and objects which yielded no content.
        if not __.generics.is_value( outcome ): continue
        if outcome.value is None: continue
        extracted.append( dict( outcome.value ) )
    return extracted

142 

143 

def parse_documentation_html(
    content: str, element_id: str, url: str, *,
    theme: __.Absential[ str ] = __.absent
) -> __.cabc.Mapping[ str, str ]:
    ''' Parses HTML content to extract documentation sections.

        Raises parse, content-absence, or object-absence failures when
        the page cannot be processed or the anchor cannot be located.
    '''
    try:
        soup = _BeautifulSoup( content, 'lxml' )
    except Exception as exc:
        raise __.DocumentationParseFailure( element_id, exc ) from exc
    # Theme arrives from detection metadata; when absent, container
    # lookup falls back to generic heuristics.
    container = _find_main_content_container( soup, theme )
    if __.is_absent( container ):
        raise __.DocumentationContentAbsence( element_id )
    target = container.find( id = element_id )
    if not target:
        raise __.DocumentationObjectAbsence( element_id, url )
    signature, description = _extract_content_with_dsl(
        target, element_id, theme )
    sections = {
        'signature': signature,
        'description': description,
        'object_name': element_id,
    }
    return sections

168 

169 

def _cleanup_content(
    content: str,
    cleanup_selectors: __.cabc.Sequence[ str ]
) -> str:
    ''' Removes unwanted elements from content using CSS selectors.

        Parses the HTML fragment, decomposes every element matching any
        of the supplied CSS selectors (e.g. ``a.headerlink``), and
        re-serializes the fragment. Cleanup is best-effort: on any parse
        or selector failure the original content is returned unchanged.
        (Previously this was an unimplemented TODO stub which returned
        the content untouched, so declared cleanup selectors had no
        effect.)
    '''
    if not content or not cleanup_selectors: return content
    try:
        # 'html.parser' round-trips fragments without injecting
        # <html>/<body> wrappers, unlike the 'lxml' parser used for
        # whole pages elsewhere in this module.
        fragment = _BeautifulSoup( content, 'html.parser' )
        for selector in cleanup_selectors:
            for node in fragment.select( selector ):
                node.decompose( )
        return str( fragment )
    except Exception:
        # Best-effort: never let cosmetic cleanup break extraction.
        return content

177 

178 

def _extract_content_with_dsl(
    element: __.typx.Any,
    element_id: str,
    theme: __.Absential[ str ] = __.absent
) -> tuple[ str, str ]:
    ''' Extracts content using DSL pattern configuration.

        Selects the pattern for the detected theme (generic fallback
        otherwise) and applies the strategy registered for the anchor
        element's tag name.
    '''
    pattern = _GENERIC_PATTERN
    if not __.is_absent( theme ) and theme is not None:
        pattern = THEME_EXTRACTION_PATTERNS.get( theme, _GENERIC_PATTERN )
    strategies = __.typx.cast(
        __.cabc.Mapping[ str, __.cabc.Mapping[ str, __.typx.Any ] ],
        pattern[ 'content_strategies' ] )
    strategy = strategies.get( element.name )
    # Unknown tag for this theme: fall back to generic extraction.
    if not strategy: return _generic_extraction( element )
    signature = _extract_signature_with_strategy( element, strategy )
    description = _extract_description_with_strategy( element, strategy )
    if 'cleanup_selectors' in pattern:
        selectors = __.typx.cast(
            __.cabc.Sequence[ str ], pattern[ 'cleanup_selectors' ] )
        description = _cleanup_content( description, selectors )
    return signature, description

201 

202 

def _extract_description_with_strategy(
    element: __.typx.Any,
    strategy: __.cabc.Mapping[ str, __.typx.Any ]
) -> str:
    ''' Extracts description using DSL strategy. '''
    source = __.typx.cast( str, strategy[ 'description_source' ] )
    # Default to paragraph elements when the strategy omits a tag name.
    tag = __.typx.cast( str, strategy.get( 'description_element', 'p' ) )
    return _get_description_by_source_type( element, source, tag )

213 

214 

async def _extract_object_documentation(
    auxdata: __.ApplicationGlobals,
    base_url: __.typx.Any,
    obj: dict[ str, __.typx.Any ],
    include_snippets: bool,
    theme: __.Absential[ str ] = __.absent
) -> dict[ str, __.typx.Any ] | None:
    ''' Extracts documentation for a single object.

        Retrieves the object's documentation page, parses out its
        signature and description, and converts the description to
        Markdown. Returns ``None`` on any retrieval or parse failure so
        that batch extraction can skip unusable objects without aborting.
    '''
    # Local import — presumably avoids an import cycle with the
    # conversion module; confirm before hoisting to module level.
    from . import conversion as _conversion
    doc_url = _urls.derive_documentation_url(
        base_url, obj[ 'uri' ], obj[ 'name' ] )
    try:
        html_content = (
            await __.retrieve_url_as_text(
                auxdata.content_cache, doc_url ) )
    except Exception as exc:
        # Best-effort retrieval: log at debug level and drop the object.
        _scribe.debug( "Failed to retrieve %s: %s", doc_url, exc )
        return None
    # Prefer the URL fragment as the anchor id; fall back to the
    # object's name when the derived URL carries no fragment.
    anchor = doc_url.fragment or str( obj[ 'name' ] )
    try:
        parsed_content = parse_documentation_html(
            html_content, anchor, str( doc_url ), theme = theme )
    except Exception: return None
    description = _conversion.html_to_markdown(
        parsed_content[ 'description' ] )
    snippet_max_length = 200  # characters retained before the ellipsis
    if include_snippets:
        content_snippet = (
            description[ : snippet_max_length ] + '...'
            if len( description ) > snippet_max_length
            else description )
    else: content_snippet = ''
    return {
        'object_name': obj[ 'name' ],
        'object_type': obj[ 'role' ],
        'domain': obj[ 'domain' ],
        'priority': obj[ 'priority' ],
        'url': doc_url.geturl( ),
        'signature': parsed_content[ 'signature' ],
        'description': description,
        'content_snippet': content_snippet,
        # NOTE(review): constant score/reasons — presumably re-ranked by
        # a caller; confirm against search pipeline.
        'relevance_score': 1.0,
        'match_reasons': [ 'direct extraction' ],
    }

259 

260 

def _extract_signature_with_strategy(
    element: __.typx.Any,
    strategy: __.cabc.Mapping[ str, __.typx.Any ]
) -> str:
    ''' Extracts signature using DSL strategy. '''
    headers = [ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ]
    source = __.typx.cast( str, strategy[ 'signature_source' ] )
    if source == 'parent_text':
        parent = element.parent
        if not parent: return ''
        return _clean_extracted_text( parent.get_text( ) )
    if source == 'parent_header':
        parent = element.parent
        if not parent: return ''
        header = parent.find( headers )
        if not header: return ''
        return _clean_extracted_text( header.get_text( ) )
    if source == 'first_header':
        header = element.find( headers )
        if not header: return ''
        return _clean_extracted_text( header.get_text( ) )
    # 'self' and any unrecognized source read the element's own text.
    return _clean_extracted_text( element.get_text( ) )

288 

289 

def _find_main_content_container(
    soup: __.typx.Any, theme: __.Absential[ str ] = __.absent
) -> __.Absential[ __.typx.Any ]:
    ''' Finds the main content container using theme-specific strategies.

        Tries a prioritized list of candidate containers for the theme
        (generic heuristics when the theme is absent or unrecognized)
        and returns the first match, or ``absent`` when none exists.
    '''
    generic_finders = (
        lambda: soup.find( 'article', { 'role': 'main' } ),  # Furo theme
        lambda: soup.find( 'div', { 'class': 'body' } ),     # Basic theme
        lambda: soup.find( 'div', { 'class': 'content' } ),  # Nature theme
        lambda: soup.find( 'div', { 'class': 'main' } ),     # Generic main
        lambda: soup.find( 'main' ),                         # HTML5 main
        lambda: soup.find( 'div', { 'role': 'main' } ),      # Role-based
        lambda: soup.body,                                   # Last resort
    )
    finders_by_theme = {
        'furo': (
            lambda: soup.find( 'article', { 'role': 'main' } ),
            lambda: soup.find( 'div', { 'id': 'furo-main-content' } ),
        ),
        'sphinx_rtd_theme': (
            lambda: soup.find( 'div', { 'class': 'document' } ),
            lambda: soup.find( 'div', { 'class': 'body' } ),
            lambda: soup.find( 'div', { 'role': 'main' } ),
        ),
        'pydoctheme': (  # Python docs
            lambda: soup.find( 'div', { 'class': 'body' } ),
            lambda: soup.find( 'div', { 'class': 'content' } ),
            lambda: soup.body,  # Python docs often use body directly
        ),
        'flask': (  # Flask docs
            lambda: soup.find( 'div', { 'class': 'body' } ),
            lambda: soup.find( 'div', { 'class': 'content' } ),
            lambda: soup.body,
        ),
        'alabaster': (
            lambda: soup.find( 'div', { 'class': 'body' } ),
            lambda: soup.find( 'div', { 'class': 'content' } ),
        ),
    }
    finders = generic_finders
    if not __.is_absent( theme ):
        finders = finders_by_theme.get( theme, generic_finders )
    for finder in finders:
        candidate = finder( )
        if candidate: return candidate
    return __.absent

335 

336 

def _clean_extracted_text( text: str ) -> str:
    ''' Cleans extracted text while preserving internal spacing.

        Trims the ends, collapses runs of spaces, and squeezes blank-line
        runs down to a single paragraph break.
    '''
    collapsed = __.re.sub( r' +', ' ', text.strip( ) )
    return __.re.sub( r'\n\s*\n', '\n\n', collapsed )

345 

346 

def _generic_extraction( element: __.typx.Any ) -> tuple[ str, str ]:
    ''' Generic fallback extraction for unknown element types.

        Signature is the element's own cleaned text; description is the
        raw HTML of the first paragraph under its parent, if any.
    '''
    signature = _clean_extracted_text( element.get_text( ) )
    parent = element.parent
    if not parent: return signature, ''
    paragraph = parent.find( 'p' )
    description = str( paragraph ) if paragraph else ''
    return signature, description

356 

357 

def _get_description_by_source_type(
    element: __.typx.Any,
    source_type: str,
    element_type: str
) -> str:
    ''' Gets description content based on source type.

        Dispatches to the helper matching the DSL source type; unknown
        source types yield an empty string.
    '''
    if source_type == 'first_paragraph':
        return _get_first_paragraph_text( element )
    handlers = {
        'next_sibling': _get_sibling_text,
        'parent_next_sibling': _get_parent_sibling_text,
        'parent_next_element': _get_parent_element_text,
        'parent_content': _get_parent_content_text,
    }
    handler = handlers.get( source_type )
    if handler is None: return ''
    return handler( element, element_type )

376 

377 

378def _get_first_paragraph_text( element: __.typx.Any ) -> str: 

379 ''' Gets HTML content from first paragraph within element. ''' 

380 paragraph = element.find( 'p' ) 

381 return str( paragraph ) if paragraph else '' 

382 

383 

384def _get_parent_content_text( element: __.typx.Any, element_type: str ) -> str: 

385 ''' Gets HTML content from content element within parent. ''' 

386 if element.parent: 

387 content_elem = element.parent.find( element_type ) 

388 return content_elem.decode_contents( ) if content_elem else '' 

389 return '' 

390 

391 

392def _get_parent_element_text( element: __.typx.Any, element_type: str ) -> str: 

393 ''' Gets HTML content from element within parent. ''' 

394 if element.parent: 

395 next_elem = element.parent.find( element_type ) 

396 return next_elem.decode_contents( ) if next_elem else '' 

397 return '' 

398 

399 

400def _get_parent_sibling_text( element: __.typx.Any, element_type: str ) -> str: 

401 ''' Gets HTML content from parent's next sibling element. ''' 

402 if element.parent: 

403 sibling = element.parent.find_next_sibling( element_type ) 

404 return sibling.decode_contents( ) if sibling else '' 

405 return '' 

406 

407 

408def _get_sibling_text( element: __.typx.Any, element_type: str ) -> str: 

409 ''' Gets HTML content from next sibling element. ''' 

410 sibling = element.find_next_sibling( element_type ) 

411 return sibling.decode_contents( ) if sibling else ''