Coverage for sources/librovore/cacheproxy.py: 87%

411 statements  


# vim: set filetype=python fileencoding=utf-8:
# -*- coding: utf-8 -*-

#============================================================================#
#                                                                            #
# Licensed under the Apache License, Version 2.0 (the "License");            #
# you may not use this file except in compliance with the License.           #
# You may obtain a copy of the License at                                    #
#                                                                            #
#     http://www.apache.org/licenses/LICENSE-2.0                             #
#                                                                            #
# Unless required by applicable law or agreed to in writing, software        #
# distributed under the License is distributed on an "AS IS" BASIS,          #
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.   #
# See the License for the specific language governing permissions and        #
# limitations under the License.                                             #
#                                                                            #
#============================================================================#


''' HTTP cache for documentation URL access. '''


from http import HTTPStatus as _HttpStatus
from urllib.parse import ParseResult as _Url
from urllib.robotparser import RobotFileParser as _RobotFileParser

import appcore.generics as _generics
import httpx as _httpx

from . import __
from . import exceptions as _exceptions


HttpClientFactory: __.typx.TypeAlias = (
    __.cabc.Callable[ [ ], _httpx.AsyncClient ] )
ContentResponse: __.typx.TypeAlias = _generics.Result[ bytes, Exception ]
ProbeResponse: __.typx.TypeAlias = _generics.Result[ bool, Exception ]
RobotsResponse: __.typx.TypeAlias = (
    _generics.Result[ _RobotFileParser, Exception ] )
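
# Illustrative sketch (not part of the original module): how the ``Result``
# aliases above are consumed. Errors are cached alongside successes so that
# repeated failures do not hammer an origin; only the ``Value``/``Error``
# constructors and the ``is_value``/``extract`` operations actually used in
# this module are assumed here.
#
#     result: ContentResponse = _generics.Value( b'payload' )
#     if result.is_value( ): content = result.extract( )
#     failure: ContentResponse = _generics.Error( TimeoutError( ) )
#     failure.is_value( )  # False; callers fall back to the error TTL
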


class CacheEntry( __.immut.DataclassObject ):
    ''' Cache entry base. '''

    timestamp: float
    ttl: float

    @property
    def invalid( self ) -> bool:
        ''' Checks if cache entry has exceeded its TTL. '''
        return __.time.time( ) - self.timestamp > self.ttl


class ContentCacheEntry( CacheEntry ):
    ''' Cache entry for URL content with size tracking. '''

    response: ContentResponse
    headers: _httpx.Headers
    size_bytes: int

    @property
    def memory_usage( self ) -> int:
        ''' Calculates total memory usage including metadata. '''
        return self.size_bytes + 100 # Overhead estimate


class ProbeCacheEntry( CacheEntry ):
    ''' Cache entry for URL probe results. '''

    response: ProbeResponse


class RobotsCacheEntry( CacheEntry ):
    ''' Cache entry for robots.txt parser. '''

    response: RobotsResponse


class Cache( __.immut.Object ):
    ''' Cache base with shared configuration attributes. '''

    error_ttl: float = 30.0
    success_ttl: float = 300.0

    def __init__(
        self, *,
        error_ttl: __.Absential[ float ] = __.absent,
        success_ttl: __.Absential[ float ] = __.absent,
        delay_function: __.cabc.Callable[
            [ float ], __.cabc.Awaitable[ None ]
        ] = __.asyncio.sleep
    ) -> None:
        if not __.is_absent( error_ttl ): self.error_ttl = error_ttl
        if not __.is_absent( success_ttl ): self.success_ttl = success_ttl
        self.delay_function = delay_function
        self._request_mutexes: dict[ str, __.asyncio.Lock ] = { }

    @__.ctxl.asynccontextmanager
    async def acquire_mutex_for( self, url: str ):
        ''' Acquires mutex for HTTP request deduplication. '''
        if url not in self._request_mutexes: # pragma: no branch
            self._request_mutexes[ url ] = __.asyncio.Lock( )
        mutex = self._request_mutexes[ url ]
        async with mutex:
            try: yield
            finally: self._request_mutexes.pop( url, None )
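
# Illustrative sketch (hypothetical caller, not part of the original
# module): ``acquire_mutex_for`` serializes concurrent fetches of the same
# URL, so only one coroutine performs the request while the others wait;
# the lock is discarded once the guarded request completes.
#
#     cache = ProbeCache( )
#     async with cache.acquire_mutex_for( 'https://example.org/' ):
#         ...  # perform the HEAD request; duplicate requests block here
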


class RobotsCache( Cache ):
    ''' Cache manager for robots.txt files with crawl delay tracking. '''

    entries_max: int = 500
    request_timeout: float = 5.0
    ttl: float = 3600.0
    user_agent: str = '*'

    def __init__(
        self, *,
        entries_max: __.Absential[ int ] = __.absent,
        ttl: __.Absential[ float ] = __.absent,
        request_timeout: __.Absential[ float ] = __.absent,
        user_agent: __.Absential[ str ] = __.absent,
        **base_initargs: __.typx.Any
    ) -> None:
        super( ).__init__( **base_initargs )
        if not __.is_absent( entries_max ): self.entries_max = entries_max
        if not __.is_absent( ttl ): self.ttl = ttl
        if not __.is_absent( request_timeout ):
            self.request_timeout = request_timeout
        if not __.is_absent( user_agent ): self.user_agent = user_agent
        self._cache: dict[ str, RobotsCacheEntry ] = { }
        self._recency: __.collections.deque[ str ] = __.collections.deque( )
        self._request_delays: dict[ str, float ] = { }

    @classmethod
    def from_configuration(
        cls, configuration: __.cabc.Mapping[ str, __.typx.Any ]
    ) -> __.typx.Self:
        ''' Creates RobotsCache instance from application configuration. '''
        cache_config = configuration.get( 'cache', { } )
        robots_ttl = cache_config.get( 'robots-ttl', 3600.0 )
        return cls( ttl = robots_ttl )

    async def access( self, domain: str ) -> __.Absential[ _RobotFileParser ]:
        ''' Retrieves cached robots.txt parser if valid. '''
        if domain not in self._cache: return __.absent
        entry = self._cache[ domain ]
        if entry.invalid:
            self._remove( domain )
            return __.absent
        self._record_access( domain )
        return entry.response.extract( )

    def assign_delay( self, domain: str, delay_seconds: float ) -> None:
        ''' Sets next allowed request time for domain. '''
        self._request_delays[ domain ] = __.time.time( ) + delay_seconds

    def calculate_delay_remainder( self, domain: str ) -> float:
        ''' Returns remaining crawl delay time for domain. '''
        allow_at = self._request_delays.get( domain, 0.0 )
        if not allow_at: return 0.0
        remainder = allow_at - __.time.time( )
        return max( 0.0, remainder )

    def determine_ttl( self, response: RobotsResponse ) -> float:
        ''' Determines appropriate TTL based on response type. '''
        if response.is_value( ): return self.ttl
        return self.error_ttl

    async def store(
        self, domain: str, response: RobotsResponse, ttl: float
    ) -> None:
        ''' Stores robots.txt parser in cache. '''
        entry = RobotsCacheEntry(
            response = response, timestamp = __.time.time( ), ttl = ttl )
        self._cache[ domain ] = entry
        self._record_access( domain )
        self._evict_by_count( )

    def _evict_by_count( self ) -> None:
        ''' Evicts oldest entries when cache exceeds max size. '''
        while (
            len( self._cache ) > self.entries_max
            and self._recency
        ):
            lru_domain = self._recency.popleft( )
            if lru_domain in self._cache: # pragma: no branch
                del self._cache[ lru_domain ]

    def _record_access( self, domain: str ) -> None:
        ''' Updates LRU access order for given domain. '''
        with __.ctxl.suppress( ValueError ):
            self._recency.remove( domain )
        self._recency.append( domain )

    def _remove( self, domain: str ) -> None:
        ''' Removes entry from cache. '''
        self._cache.pop( domain, None )
        with __.ctxl.suppress( ValueError ):
            self._recency.remove( domain )
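
# Illustrative sketch (hypothetical values, not part of the original
# module): crawl-delay bookkeeping. ``assign_delay`` records the next
# allowed request time for a domain; ``calculate_delay_remainder`` reports
# how long a caller must still wait before contacting it again.
#
#     robots = RobotsCache( )
#     robots.assign_delay( 'https://example.org', 2.0 )
#     robots.calculate_delay_remainder( 'https://example.org' )  # ~2.0
#     # ...after two seconds elapse, the remainder decays to 0.0
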


class ContentCache( Cache, instances_mutables = ( '_memory_total', ) ):
    ''' Cache manager for URL content (GET requests) with memory tracking. '''

    memory_max: int = 32 * 1024 * 1024

    def __init__(
        self, *,
        robots_cache: __.Absential[ RobotsCache ] = __.absent,
        memory_max: __.Absential[ int ] = __.absent,
        **base_initargs: __.typx.Any
    ) -> None:
        super( ).__init__( **base_initargs )
        if __.is_absent( robots_cache ):
            self.robots_cache = RobotsCache( **base_initargs )
        else: self.robots_cache = robots_cache
        if not __.is_absent( memory_max ): self.memory_max = memory_max
        self._cache: dict[ str, ContentCacheEntry ] = { }
        self._memory_total = 0
        self._recency: __.collections.deque[ str ] = __.collections.deque( )

    @classmethod
    def from_configuration(
        cls,
        configuration: __.cabc.Mapping[ str, __.typx.Any ],
        robots_cache: __.Absential[ RobotsCache ] = __.absent
    ) -> __.typx.Self:
        ''' Creates ContentCache instance from application configuration. '''
        cache_config = configuration.get( 'cache', { } )
        content_ttl = cache_config.get( 'content-ttl', 300.0 )
        memory_limit = cache_config.get( 'memory-limit', 33554432 )
        nomargs = {
            'success_ttl': content_ttl,
            'memory_max': memory_limit,
        }
        if not __.is_absent( robots_cache ):
            nomargs[ 'robots_cache' ] = robots_cache
        return cls( **nomargs )

    async def access(
        self, url: str
    ) -> __.Absential[ tuple[ bytes, _httpx.Headers ] ]:
        ''' Retrieves cached content if valid. '''
        if url not in self._cache: return __.absent
        entry = self._cache[ url ]
        if entry.invalid:
            self._remove( url )
            return __.absent
        self._record_access( url )
        return ( entry.response.extract( ), entry.headers )

    def determine_ttl( self, response: ContentResponse ) -> float:
        ''' Determines appropriate TTL based on response type. '''
        if response.is_value( ):
            return self.success_ttl
        # TODO: Inspect exception type for more granular TTL
        return self.error_ttl

    async def retrieve_url(
        self,
        url: _Url, /, *,
        duration_max: float = 30.0,
        client_factory: HttpClientFactory = _httpx.AsyncClient,
    ) -> bytes:
        ''' Convenience method for retrieving URL content. '''
        return await retrieve_url(
            self, url,
            duration_max = duration_max,
            client_factory = client_factory )

    async def store(
        self, url: str, response: ContentResponse,
        headers: _httpx.Headers, ttl: float
    ) -> None:
        ''' Stores content in cache with memory management. '''
        size_bytes = self._calculate_response_size( response )
        entry = ContentCacheEntry(
            response = response,
            headers = headers,
            timestamp = __.time.time( ),
            ttl = ttl,
            size_bytes = size_bytes )
        if old_entry := self._cache.get( url ):
            self._memory_total -= old_entry.memory_usage
        self._cache[ url ] = entry
        self._memory_total += entry.memory_usage
        self._record_access( url )
        self._evict_by_memory( )

    def _calculate_response_size( self, response: ContentResponse ) -> int:
        ''' Calculates memory footprint of cached response. '''
        if response.is_value( ):
            content = response.extract( )
            return len( content )
        return 100 # Conservative estimate for exception overhead

    def _evict_by_memory( self ) -> None:
        ''' Evicts LRU entries until memory usage is under limit. '''
        while (
            self._memory_total > self.memory_max
            and self._recency
        ):
            lru_url = self._recency.popleft( )
            if lru_url in self._cache: # pragma: no branch
                entry = self._cache[ lru_url ]
                self._memory_total -= entry.memory_usage
                del self._cache[ lru_url ]
                _scribe.debug( f"Evicted cache entry: {lru_url}" )

    def _record_access( self, url: str ) -> None:
        ''' Updates LRU access order for given URL. '''
        with __.ctxl.suppress( ValueError ):
            self._recency.remove( url )
        self._recency.append( url )

    def _remove( self, url: str ) -> None:
        ''' Removes entry from cache and updates memory tracking. '''
        if entry := self._cache.pop( url, None ):
            self._memory_total -= entry.memory_usage
        with __.ctxl.suppress( ValueError ):
            self._recency.remove( url )
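
# Illustrative sketch (hypothetical configuration, not part of the original
# module): constructing a ContentCache and fetching through it. The
# configuration keys mirror those read by ``from_configuration`` above.
#
#     from urllib.parse import urlparse
#     cache = ContentCache.from_configuration(
#         { 'cache': { 'content-ttl': 300.0, 'memory-limit': 33554432 } } )
#     content = await cache.retrieve_url( urlparse( 'https://example.org/' ) )
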


class ProbeCache( Cache ):
    ''' Cache manager for URL probe results (HEAD requests). '''

    entries_max: int = 1000

    def __init__(
        self, *,
        robots_cache: __.Absential[ RobotsCache ] = __.absent,
        entries_max: __.Absential[ int ] = __.absent,
        **base_initargs: __.typx.Any
    ) -> None:
        super( ).__init__( **base_initargs )
        if __.is_absent( robots_cache ):
            self.robots_cache = RobotsCache( **base_initargs )
        else: self.robots_cache = robots_cache
        if not __.is_absent( entries_max ): self.entries_max = entries_max
        self._cache: dict[ str, ProbeCacheEntry ] = { }
        self._recency: __.collections.deque[ str ] = __.collections.deque( )

    @classmethod
    def from_configuration(
        cls,
        configuration: __.cabc.Mapping[ str, __.typx.Any ],
        robots_cache: __.Absential[ RobotsCache ] = __.absent
    ) -> __.typx.Self:
        ''' Creates ProbeCache instance from application configuration. '''
        cache_config = configuration.get( 'cache', { } )
        probe_ttl = cache_config.get( 'probe-ttl', 300.0 )
        nomargs = { 'success_ttl': probe_ttl }
        if not __.is_absent( robots_cache ):
            nomargs[ 'robots_cache' ] = robots_cache
        return cls( **nomargs )

    async def access( self, url: str ) -> __.Absential[ bool ]:
        ''' Retrieves cached probe result if valid. '''
        if url not in self._cache: return __.absent
        entry = self._cache[ url ]
        if entry.invalid:
            self._remove( url )
            return __.absent
        self._record_access( url )
        return entry.response.extract( )

    def determine_ttl( self, response: ProbeResponse ) -> float:
        ''' Determines appropriate TTL based on response type. '''
        if response.is_value( ):
            return self.success_ttl
        # TODO: Inspect exception type for more granular TTL
        return self.error_ttl

    async def probe_url(
        self,
        url: _Url, /, *,
        duration_max: float = 10.0,
        client_factory: HttpClientFactory = _httpx.AsyncClient,
    ) -> bool:
        ''' Convenience method for probing URL existence. '''
        return await probe_url(
            self, url,
            duration_max = duration_max,
            client_factory = client_factory )

    async def store(
        self, url: str, response: ProbeResponse, ttl: float
    ) -> None:
        ''' Stores probe result in cache. '''
        entry = ProbeCacheEntry(
            response = response,
            timestamp = __.time.time( ),
            ttl = ttl )
        self._cache[ url ] = entry
        self._record_access( url )
        self._evict_by_count( )

    def _evict_by_count( self ) -> None:
        ''' Evicts oldest entries when cache exceeds max size. '''
        while (
            len( self._cache ) > self.entries_max
            and self._recency
        ):
            lru_url = self._recency.popleft( )
            if lru_url in self._cache: # pragma: no branch
                del self._cache[ lru_url ]

    def _record_access( self, url: str ) -> None:
        ''' Updates LRU access order for given URL. '''
        with __.ctxl.suppress( ValueError ):
            self._recency.remove( url )
        self._recency.append( url )

    def _remove( self, url: str ) -> None:
        ''' Removes entry from cache. '''
        self._cache.pop( url, None )
        with __.ctxl.suppress( ValueError ):
            self._recency.remove( url )


_http_success_threshold = 400


class CacheContext( __.immut.DataclassObject ):
    ''' Context carrying configured cache instances. '''

    content_cache: ContentCache
    probe_cache: ProbeCache
    robots_cache: RobotsCache

    @classmethod
    def from_configuration(
        cls,
        configuration: __.cabc.Mapping[ str, __.typx.Any ]
    ) -> __.typx.Self:
        ''' Creates cache context from application configuration. '''
        robots_cache = RobotsCache.from_configuration( configuration )
        return cls(
            content_cache = ContentCache.from_configuration(
                configuration, robots_cache ),
            probe_cache = ProbeCache.from_configuration(
                configuration, robots_cache ),
            robots_cache = robots_cache,
        )
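
# Illustrative sketch (hypothetical configuration, not part of the original
# module): the configuration shape consumed by ``from_configuration``. Only
# keys actually read in this module are shown; all are optional and fall
# back to the defaults above.
#
#     context = CacheContext.from_configuration( {
#         'cache': {
#             'robots-ttl': 3600.0,
#             'content-ttl': 300.0,
#             'probe-ttl': 300.0,
#             'memory-limit': 33554432,
#         },
#     } )
#     # context.content_cache and context.probe_cache share
#     # context.robots_cache, so robots.txt is fetched once per domain.
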


_scribe = __.acquire_scribe( __name__ )


def prepare(
    auxdata: __.Globals
) -> tuple[ ContentCache, ProbeCache, RobotsCache ]:
    ''' Prepares cache instances from configuration.

        Returns cache instances constructed from application configuration.
    '''
    configuration = auxdata.configuration
    robots_cache = RobotsCache.from_configuration( configuration )
    return (
        ContentCache.from_configuration( configuration, robots_cache ),
        ProbeCache.from_configuration( configuration, robots_cache ),
        robots_cache,
    )


async def probe_url(
    cache: ProbeCache,
    url: _Url, *,
    duration_max: float = 10.0,
    client_factory: HttpClientFactory = _httpx.AsyncClient,
) -> bool:
    ''' Cached HEAD request to check URL existence. '''
    url_s = url.geturl( )
    match url.scheme:
        case '' | 'file':
            return __.Path( url.path ).exists( )
        case 'http' | 'https':
            result = await cache.access( url_s )
            if not __.is_absent( result ): return result
            async with client_factory( ) as client:
                result = await _probe_url(
                    url, duration_max = duration_max,
                    client = client,
                    probe_cache = cache,
                    robots_cache = cache.robots_cache )
                ttl = cache.determine_ttl( result )
                await cache.store( url_s, result, ttl )
                return result.extract( )
        case _: return False


async def retrieve_url(
    cache: ContentCache,
    url: _Url, *,
    duration_max: float = 30.0,
    client_factory: HttpClientFactory = _httpx.AsyncClient,
) -> bytes:
    ''' Cached GET request to fetch URL content as bytes. '''
    url_s = url.geturl( )
    match url.scheme:
        case '' | 'file':
            location = __.Path( url.path )
            try: return location.read_bytes( )
            except Exception as exc:
                raise _exceptions.DocumentationInaccessibility(
                    url_s, exc ) from exc
        case 'http' | 'https':
            result = await cache.access( url_s )
            if not __.is_absent( result ):
                content_bytes, _ = result
                return content_bytes
            async with client_factory( ) as client:
                result, headers = await _retrieve_url(
                    url,
                    duration_max = duration_max,
                    client = client,
                    content_cache = cache,
                    robots_cache = cache.robots_cache )
                ttl = cache.determine_ttl( result )
                await cache.store( url_s, result, headers, ttl )
                return result.extract( )
        case _:
            raise _exceptions.DocumentationInaccessibility(
                url_s, f"Unsupported scheme: {url.scheme}" )


async def retrieve_url_as_text(
    cache: ContentCache,
    url: _Url, *,
    duration_max: float = 30.0,
    charset_default: str = 'utf-8',
    client_factory: HttpClientFactory = _httpx.AsyncClient,
) -> str:
    ''' Cached GET request to fetch URL content as text. '''
    url_s = url.geturl( )
    match url.scheme:
        case '' | 'file':
            location = __.Path( url.path )
            try: content_bytes = location.read_bytes( )
            except Exception as exc:
                raise _exceptions.DocumentationInaccessibility(
                    url_s, exc ) from exc
            mimetype, charset = __.detext.detect_mimetype_and_charset(
                content_bytes, location )
            if not __.detext.is_textual_content( content_bytes ):
                raise _exceptions.DocumentationInaccessibility(
                    url_s, "Content analysis indicates non-textual data" )
            encoding = charset or charset_default
            return content_bytes.decode( encoding )
        case 'http' | 'https':
            result = await cache.access( url_s )
            if not __.is_absent( result ):
                content_bytes, headers = result
                _validate_textual_content(
                    content_bytes, headers, url_s )
                charset = _detect_charset_with_fallback(
                    content_bytes, headers, charset_default )
                return content_bytes.decode( charset )
            async with client_factory( ) as client:
                result, headers = await _retrieve_url(
                    url, duration_max = duration_max,
                    client = client,
                    content_cache = cache,
                    robots_cache = cache.robots_cache )
                ttl = cache.determine_ttl( result )
                await cache.store( url_s, result, headers, ttl )
                content_bytes = result.extract( )
                _validate_textual_content(
                    content_bytes, headers, url_s )
                charset = _detect_charset_with_fallback(
                    content_bytes, headers, charset_default )
                return content_bytes.decode( charset )
        case _:
            raise _exceptions.DocumentationInaccessibility(
                url_s, f"Unsupported scheme: {url.scheme}" )
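
# Illustrative sketch (hypothetical caller, not part of the original
# module): fetching a documentation page as text. The parsed URL drives the
# scheme dispatch above: filesystem paths are read directly, while HTTP(S)
# URLs go through the cache, robots.txt checks, and charset detection.
#
#     from urllib.parse import urlparse
#     cache = ContentCache( )
#     text = await retrieve_url_as_text(
#         cache, urlparse( 'https://example.org/docs/index.html' ) )
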


async def _apply_request_delay(
    url: _Url,
    client: _httpx.AsyncClient,
    cache: RobotsCache,
) -> None:
    ''' Applies crawl delay to request if specified in robots.txt. '''
    if url.scheme not in ( 'http', 'https' ): return
    domain = _extract_domain( url )
    delay = cache.calculate_delay_remainder( domain )
    if delay > 0: await cache.delay_function( delay )
    parser = await cache.access( domain )
    if __.is_absent( parser ):
        parser = await _retrieve_robots_txt( client, cache, domain )
    if not __.is_absent( parser ):
        try: delay = parser.crawl_delay( cache.user_agent )
        except Exception as exc:
            _scribe.debug( f"Failed to get crawl delay for {domain}: {exc}" )
        else:
            if delay: cache.assign_delay( domain, float( delay ) )


async def _cache_robots_txt_error(
    domain: str, cache: RobotsCache, error: Exception
) -> __.Absential[ _RobotFileParser ]:
    _scribe.debug( f"Failed to fetch/parse robots.txt from {domain}: {error}" )
    result: RobotsResponse = _generics.Error( error )
    return await _cache_robots_txt_result( cache, domain, result )


async def _cache_robots_txt_result(
    cache: RobotsCache, domain: str, result: RobotsResponse
) -> __.Absential[ _RobotFileParser ]:
    ttl = cache.determine_ttl( result )
    await cache.store( domain, result, ttl )
    return result.extract( ) if result.is_value( ) else __.absent


async def _check_robots_txt(
    url: _Url, *,
    client: _httpx.AsyncClient,
    cache: RobotsCache,
) -> bool:
    ''' Checks if URL is allowed by robots.txt. '''
    if url.scheme not in ( 'http', 'https' ): return True
    url_s = url.geturl( )
    domain = _extract_domain( url )
    parser = await cache.access( domain )
    if __.is_absent( parser ):
        parser = await _retrieve_robots_txt( client, cache, domain )
        if __.is_absent( parser ): return True
    try: return parser.can_fetch( cache.user_agent, url_s )
    except Exception as exc:
        _scribe.debug( f"robots.txt check failed for {url_s}: {exc}" )
        return True # if no robots.txt, then assume URL allowed


def _detect_charset_with_fallback(
    content: bytes, headers: _httpx.Headers, default: str
) -> str:
    ''' Detects charset from headers with content-based fallback. '''
    header_charset = _extract_charset_from_headers( headers, '' )
    if header_charset:
        return header_charset
    detected_charset = __.detext.detect_charset( content )
    return detected_charset or default


def _detect_mimetype_with_fallback(
    content: bytes, headers: _httpx.Headers, url: str
) -> str:
    ''' Detects MIME type from headers with content-based fallback. '''
    header_mimetype = _extract_mimetype_from_headers( headers )
    if header_mimetype:
        return header_mimetype
    return __.detext.detect_mimetype( content, url ) or ''


def _extract_charset_from_headers(
    headers: _httpx.Headers, default: str
) -> str:
    ''' Extracts charset from Content-Type header. '''
    content_type = headers.get( 'content-type', '' )
    if isinstance( content_type, str ) and ';' in content_type:
        _, _, params = content_type.partition( ';' )
        if 'charset=' in params:
            charset = params.split( 'charset=' )[ -1 ].strip( )
            return charset.strip( '"\'' )
    return default
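
# Illustrative examples (not part of the original module) of the
# Content-Type parsing above:
#
#     'text/html; charset=utf-8'    ->  'utf-8'
#     'text/html; charset="utf-8"'  ->  'utf-8'  (surrounding quotes removed)
#     'text/html'                   ->  default  (no parameters present)
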


def _extract_domain( url: _Url ) -> str:
    ''' Extracts domain from URL for robots.txt caching. '''
    return f"{url.scheme}://{url.netloc}"


def _extract_mimetype_from_headers( headers: _httpx.Headers ) -> str:
    ''' Extracts mimetype from Content-Type header. '''
    content_type = headers.get( 'content-type', '' )
    if isinstance( content_type, str ) and ';' in content_type:
        mimetype, _, _ = content_type.partition( ';' )
        return mimetype.strip( )
    return content_type


def _raise_non_textual_content( url: str ) -> None:
    ''' Raises exception for non-textual content. '''
    raise _exceptions.DocumentationInaccessibility(
        url, "Content analysis indicates non-textual data" )


def _raise_non_textual_mimetype( url: str, mimetype: str ) -> None:
    ''' Raises exception for non-textual MIME type. '''
    raise _exceptions.DocumentationInaccessibility(
        url, f"Non-textual content detected: {mimetype}" )


async def _probe_url(
    url: _Url, /, *,
    duration_max: float,
    client: _httpx.AsyncClient,
    probe_cache: ProbeCache,
    robots_cache: RobotsCache,
) -> ProbeResponse:
    ''' Makes HEAD request with deduplication. '''
    url_s = url.geturl( )
    if not await _check_robots_txt(
        url, client = client, cache = robots_cache
    ):
        _scribe.debug( f"URL blocked by robots.txt: {url_s}" )
        return _generics.Error( _exceptions.UrlImpermissibility(
            url_s, robots_cache.user_agent ) )
    await _apply_request_delay( url, cache = robots_cache, client = client )
    async with probe_cache.acquire_mutex_for( url_s ):
        try:
            response = await client.head(
                url_s, timeout = duration_max, follow_redirects = True )
        except Exception as exc:
            _scribe.debug( f"HEAD request failed for {url_s}: {exc}" )
            return _generics.Error( exc )
        else:
            return _generics.Value(
                response.status_code < _http_success_threshold )


async def _retrieve_robots_txt(
    client: _httpx.AsyncClient, cache: RobotsCache, domain: str
) -> __.Absential[ _RobotFileParser ]:
    ''' Fetches and parses robots.txt for domain. '''
    robots_url = f"{domain}/robots.txt"
    async with cache.acquire_mutex_for( domain ):
        timeout = cache.request_timeout
        try:
            response = await client.get(
                robots_url, timeout = timeout, follow_redirects = True )
        except Exception as exc:
            return await _cache_robots_txt_error( domain, cache, exc )
        match response.status_code:
            case _HttpStatus.OK: lines = response.text.splitlines( )
            case _HttpStatus.NOT_FOUND: lines = [ ]
            case _:
                try: response.raise_for_status( )
                except Exception as exc:
                    return await _cache_robots_txt_error( domain, cache, exc )
        robots_parser = _RobotFileParser( )
        robots_parser.set_url( robots_url )
        try: robots_parser.parse( lines )
        except Exception as exc:
            return await _cache_robots_txt_error( domain, cache, exc )
        result: RobotsResponse = _generics.Value( robots_parser )
        return await _cache_robots_txt_result( cache, domain, result )


async def _retrieve_url(
    url: _Url, /, *,
    duration_max: float,
    client: _httpx.AsyncClient,
    content_cache: ContentCache,
    robots_cache: RobotsCache,
) -> tuple[ ContentResponse, _httpx.Headers ]:
    ''' Makes GET request with deduplication. '''
    url_s = url.geturl( )
    if not await _check_robots_txt(
        url, cache = robots_cache, client = client
    ):
        return (
            _generics.Error( _exceptions.UrlImpermissibility(
                url_s, robots_cache.user_agent ) ),
            _httpx.Headers( ) )
    await _apply_request_delay( url, cache = robots_cache, client = client )
    async with content_cache.acquire_mutex_for( url_s ):
        try:
            response = await client.get(
                url_s, timeout = duration_max, follow_redirects = True )
            response.raise_for_status( )
        except Exception as exc:
            _scribe.debug( f"GET request failed for {url_s}: {exc}" )
            return _generics.Error( exc ), _httpx.Headers( )
        else: return _generics.Value( response.content ), response.headers


def _validate_textual_content(
    content: bytes, headers: _httpx.Headers, url: str
) -> None:
    ''' Validates that content is textual via headers and content analysis. '''
    mimetype = _detect_mimetype_with_fallback( content, headers, url )
    if mimetype and not __.detext.is_textual_mimetype( mimetype ):
        raise _exceptions.HttpContentTypeInvalidity(
            url, mimetype, "text decoding" )
    if not __.detext.is_textual_content( content ):
        raise _exceptions.HttpContentTypeInvalidity(
            url, mimetype or 'unknown', "content analysis" )