Coverage for sources/librovore/cacheproxy.py: 87%

399 statements  

coverage.py v7.10.6, created at 2025-09-06 02:25 +0000

1# vim: set filetype=python fileencoding=utf-8: 

2# -*- coding: utf-8 -*- 

3 

4#============================================================================# 

5# # 

6# Licensed under the Apache License, Version 2.0 (the "License"); # 

7# you may not use this file except in compliance with the License. # 

8# You may obtain a copy of the License at # 

9# # 

10# http://www.apache.org/licenses/LICENSE-2.0 # 

11# # 

12# Unless required by applicable law or agreed to in writing, software # 

13# distributed under the License is distributed on an "AS IS" BASIS, # 

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 

15# See the License for the specific language governing permissions and # 

16# limitations under the License. # 

17# # 

18#============================================================================# 

19 

20 

21''' HTTP cache for documentation URL access. ''' 

22 

23 

24from http import HTTPStatus as _HttpStatus 

25from urllib.parse import ParseResult as _Url 

26from urllib.robotparser import RobotFileParser as _RobotFileParser 

27 

28import appcore.generics as _generics 

29import httpx as _httpx 

30 

31from . import __ 

32from . import exceptions as _exceptions 

33 

34 

35HttpClientFactory: __.typx.TypeAlias = ( 

36 __.cabc.Callable[ [ ], _httpx.AsyncClient ] ) 

37ContentResponse: __.typx.TypeAlias = _generics.Result[ bytes, Exception ] 

38ProbeResponse: __.typx.TypeAlias = _generics.Result[ bool, Exception ] 

39RobotsResponse: __.typx.TypeAlias = ( 

40 _generics.Result[ _RobotFileParser, Exception ] ) 

41 

42 
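These aliases wrap outcomes in the appcore.generics Result type instead of raising, so errors can be cached alongside successes. A minimal consumption sketch, assuming ContentResponse is importable from this module and using only the Value/Error, is_value, and extract API already used below:

import appcore.generics as _generics

def summarize( response: _generics.Result[ bytes, Exception ] ) -> str:
    # A Value carries fetched bytes; an Error carries the captured exception.
    if response.is_value( ): return f"ok: {len( response.extract( ) )} bytes"
    return 'error (cached under the shorter error_ttl)'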

43class CacheEntry( __.immut.DataclassObject ): 

44 ''' Cache entry base. ''' 

45 

46 timestamp: float 

47 ttl: float 

48 

49 @property 

50 def invalid( self ) -> bool: 

51 ''' Checks if cache entry has exceeded its TTL. ''' 

52 return __.time.time( ) - self.timestamp > self.ttl 

53 

54 

55class ContentCacheEntry( CacheEntry ): 

56 ''' Cache entry for URL content with size tracking. ''' 

57 

58 response: ContentResponse 

59 headers: _httpx.Headers 

60 size_bytes: int 

61 

62 @property 

63 def memory_usage( self ) -> int: 

64 ''' Calculates total memory usage including metadata. ''' 

65 return self.size_bytes + 100 # Overhead estimate 

66 

67 

68class ProbeCacheEntry( CacheEntry ): 

69 ''' Cache entry for URL probe results. ''' 

70 

71 response: ProbeResponse 

72 

73 

74class RobotsCacheEntry( CacheEntry ): 

75 ''' Cache entry for robots.txt parser. ''' 

76 

77 response: RobotsResponse 

78 

79 

80class Cache( __.immut.Object ): 

81 ''' Cache base with shared configuration attributes. ''' 

82 

83 error_ttl: float = 30.0 

84 success_ttl: float = 300.0 

85 

86 def __init__( 

87 self, *, 

88 error_ttl: __.Absential[ float ] = __.absent, 

89 success_ttl: __.Absential[ float ] = __.absent, 

90 delay_function: __.cabc.Callable[ 

91 [ float ], __.cabc.Awaitable[ None ] 

92 ] = __.asyncio.sleep 

93 ) -> None: 

94 if not __.is_absent( error_ttl ): self.error_ttl = error_ttl 

95 if not __.is_absent( success_ttl ): self.success_ttl = success_ttl 

96 self.delay_function = delay_function 

97 self._request_mutexes: dict[ str, __.asyncio.Lock ] = { } 

98 

99 @__.ctxl.asynccontextmanager 

100 async def acquire_mutex_for( self, url: str ): 

101 ''' Acquires mutex for HTTP request deduplication. ''' 

102 if url not in self._request_mutexes: # pragma: no branch 

103 self._request_mutexes[ url ] = __.asyncio.Lock( ) 

104 mutex = self._request_mutexes[ url ] 

105 async with mutex: 

106 try: yield 

107 finally: self._request_mutexes.pop( url, None ) 

108 

109 
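A deduplication sketch, using the concrete ProbeCache subclass and assuming it is importable from this module: concurrent requests for one URL serialize on a per-URL asyncio.Lock, and the mapping entry is dropped again when the holder exits.

import asyncio

async def fetch_once( cache: ProbeCache, url: str ) -> None:
    async with cache.acquire_mutex_for( url ):
        # Only one coroutine at a time proceeds for this URL; followers can
        # consult the now-warm cache instead of repeating the request.
        await asyncio.sleep( 0 )

async def main( ) -> None:
    cache = ProbeCache( )
    await asyncio.gather(
        *( fetch_once( cache, 'https://example.com/x' ) for _ in range( 3 ) ) )

asyncio.run( main( ) )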

110class RobotsCache( Cache ): 

111 ''' Cache manager for robots.txt files with crawl delay tracking. ''' 

112 

113 entries_max: int = 500 

114 request_timeout: float = 5.0 

115 ttl: float = 3600.0 

116 user_agent: str = '*' 

117 

118 def __init__( 

119 self, *, 

120 entries_max: __.Absential[ int ] = __.absent, 

121 ttl: __.Absential[ float ] = __.absent, 

122 request_timeout: __.Absential[ float ] = __.absent, 

123 user_agent: __.Absential[ str ] = __.absent, 

124 **base_initargs: __.typx.Any 

125 ) -> None: 

126 super( ).__init__( **base_initargs ) 

127 if not __.is_absent( entries_max ): self.entries_max = entries_max 

128 if not __.is_absent( ttl ): self.ttl = ttl 

129 if not __.is_absent( request_timeout ): 

130 self.request_timeout = request_timeout 

131 if not __.is_absent( user_agent ): self.user_agent = user_agent 

132 self._cache: dict[ str, RobotsCacheEntry ] = { } 

133 self._recency: __.collections.deque[ str ] = __.collections.deque( ) 

134 self._request_delays: dict[ str, float ] = { } 

135 

136 @classmethod 

137 def from_configuration( 

138 cls, configuration: __.cabc.Mapping[ str, __.typx.Any ] 

139 ) -> __.typx.Self: 

140 ''' Creates RobotsCache instance from application configuration. ''' 

141 cache_config = configuration.get( 'cache', { } ) 

142 robots_ttl = cache_config.get( 'robots-ttl', 3600.0 ) 

143 return cls( ttl = robots_ttl ) 

144 

145 async def access( self, domain: str ) -> __.Absential[ _RobotFileParser ]: 

146 ''' Retrieves cached robots.txt parser if valid. ''' 

147 if domain not in self._cache: return __.absent 

148 entry = self._cache[ domain ] 

149 if entry.invalid: 

150 self._remove( domain ) 

151 return __.absent 

152 self._record_access( domain ) 

153 return entry.response.extract( ) 

154 

155 def assign_delay( self, domain: str, delay_seconds: float ) -> None: 

156 ''' Sets next allowed request time for domain. ''' 

157 self._request_delays[ domain ] = __.time.time( ) + delay_seconds 

158 

159 def calculate_delay_remainder( self, domain: str ) -> float: 

160 ''' Returns remaining crawl delay time for domain. ''' 

161 allow_at = self._request_delays.get( domain, 0.0 ) 

162 if not allow_at: return 0.0 

163 remainder = allow_at - __.time.time( ) 

164 return max( 0.0, remainder ) 

165 

166 def determine_ttl( self, response: RobotsResponse ) -> float: 

167 ''' Determines appropriate TTL based on response type. ''' 

168 if response.is_value( ): return self.ttl 

169 return self.error_ttl 

170 

171 async def store( 

172 self, domain: str, response: RobotsResponse, ttl: float 

173 ) -> None: 

174 ''' Stores robots.txt parser in cache. ''' 

175 entry = RobotsCacheEntry( 

176 response = response, timestamp = __.time.time( ), ttl = ttl ) 

177 self._cache[ domain ] = entry 

178 self._record_access( domain ) 

179 self._evict_by_count( ) 

180 

181 def _evict_by_count( self ) -> None: 

182 ''' Evicts oldest entries when cache exceeds max size. ''' 

183 while ( 

184 len( self._cache ) > self.entries_max 

185 and self._recency 

186 ): 

187 lru_domain = self._recency.popleft( ) 

188 if lru_domain in self._cache: # pragma: no branch 

189 del self._cache[ lru_domain ] 

190 

191 def _record_access( self, domain: str ) -> None: 

192 ''' Updates LRU access order for given domain. ''' 

193 with __.ctxl.suppress( ValueError ): 

194 self._recency.remove( domain ) 

195 self._recency.append( domain ) 

196 

197 def _remove( self, domain: str ) -> None: 

198 ''' Removes entry from cache. ''' 

199 self._cache.pop( domain, None ) 

200 with __.ctxl.suppress( ValueError ): 

201 self._recency.remove( domain ) 

202 

203 
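A crawl-delay bookkeeping sketch, assuming RobotsCache is importable from this module: assign_delay records an absolute next-allowed time per domain, and calculate_delay_remainder clamps the leftover wait at zero.

import asyncio

async def wait_politely( cache: RobotsCache, domain: str ) -> None:
    cache.assign_delay( domain, 2.0 )  # next request allowed ~2 seconds from now
    remainder = cache.calculate_delay_remainder( domain )
    if remainder > 0: await cache.delay_function( remainder )

asyncio.run( wait_politely( RobotsCache( ttl = 60.0 ), 'https://example.com' ) )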

204class ContentCache( Cache, instances_mutables = ( '_memory_total', ) ): 

205 ''' Cache manager for URL content (GET requests) with memory tracking. ''' 

206 

207 memory_max: int = 32 * 1024 * 1024 

208 

209 def __init__( 

210 self, *, 

211 robots_cache: __.Absential[ RobotsCache ] = __.absent, 

212 memory_max: __.Absential[ int ] = __.absent, 

213 **base_initargs: __.typx.Any 

214 ) -> None: 

215 super( ).__init__( **base_initargs ) 

216 if __.is_absent( robots_cache ): 

217 self.robots_cache = RobotsCache( **base_initargs ) 

218 else: self.robots_cache = robots_cache 

219 if not __.is_absent( memory_max ): self.memory_max = memory_max 

220 self._cache: dict[ str, ContentCacheEntry ] = { } 

221 self._memory_total = 0 

222 self._recency: __.collections.deque[ str ] = __.collections.deque( ) 

223 

224 @classmethod 

225 def from_configuration( 

226 cls, 

227 configuration: __.cabc.Mapping[ str, __.typx.Any ], 

228 robots_cache: __.Absential[ RobotsCache ] = __.absent 

229 ) -> __.typx.Self: 

230 ''' Creates ContentCache instance from application configuration. ''' 

231 cache_config = configuration.get( 'cache', { } ) 

232 content_ttl = cache_config.get( 'content-ttl', 300.0 ) 

233 memory_limit = cache_config.get( 'memory-limit', 33554432 ) 

234 nomargs = { 

235 'success_ttl': content_ttl, 

236 'memory_max': memory_limit, 

237 } 

238 if not __.is_absent( robots_cache ): 

239 nomargs[ 'robots_cache' ] = robots_cache 

240 return cls( **nomargs ) 

241 

242 async def access( 

243 self, url: str 

244 ) -> __.Absential[ tuple[ bytes, _httpx.Headers ] ]: 

245 ''' Retrieves cached content if valid. ''' 

246 if url not in self._cache: return __.absent 

247 entry = self._cache[ url ] 

248 if entry.invalid: 

249 self._remove( url ) 

250 return __.absent 

251 self._record_access( url ) 

252 return ( entry.response.extract( ), entry.headers ) 

253 

254 def determine_ttl( self, response: ContentResponse ) -> float: 

255 ''' Determines appropriate TTL based on response type. ''' 

256 if response.is_value( ): 

257 return self.success_ttl 

258 # TODO: Inspect exception type for more granular TTL 

259 return self.error_ttl 

260 

261 async def retrieve_url( 

262 self, 

263 url: _Url, /, *, 

264 duration_max: float = 30.0, 

265 client_factory: HttpClientFactory = _httpx.AsyncClient, 

266 ) -> bytes: 

267 ''' Convenience method for retrieving URL content. ''' 

268 return await retrieve_url( 

269 self, url, 

270 duration_max = duration_max, 

271 client_factory = client_factory ) 

272 

273 async def store( 

274 self, url: str, response: ContentResponse, 

275 headers: _httpx.Headers, ttl: float 

276 ) -> None: 

277 ''' Stores content in cache with memory management. ''' 

278 size_bytes = self._calculate_response_size( response ) 

279 entry = ContentCacheEntry( 

280 response = response, 

281 headers = headers, 

282 timestamp = __.time.time( ), 

283 ttl = ttl, 

284 size_bytes = size_bytes ) 

285 if old_entry := self._cache.get( url ): 

286 self._memory_total -= old_entry.memory_usage 

287 self._cache[ url ] = entry 

288 self._memory_total += entry.memory_usage 

289 self._record_access( url ) 

290 self._evict_by_memory( ) 

291 

292 def _calculate_response_size( self, response: ContentResponse ) -> int: 

293 ''' Calculates memory footprint of cached response. ''' 

294 if response.is_value( ): 

295 content = response.extract( ) 

296 return len( content ) 

297 return 100 # Conservative estimate for exception overhead 

298 

299 def _evict_by_memory( self ) -> None: 

300 ''' Evicts LRU entries until memory usage is under limit. ''' 

301 while ( 

302 self._memory_total > self.memory_max 

303 and self._recency 

304 ): 

305 lru_url = self._recency.popleft( ) 

306 if lru_url in self._cache: # pragma: no branch 

307 entry = self._cache[ lru_url ] 

308 self._memory_total -= entry.memory_usage 

309 del self._cache[ lru_url ] 

310 _scribe.debug( f"Evicted cache entry: {lru_url}" ) 

311 

312 def _record_access( self, url: str ) -> None: 

313 ''' Updates LRU access order for given URL. ''' 

314 with __.ctxl.suppress( ValueError ): 

315 self._recency.remove( url ) 

316 self._recency.append( url ) 

317 

318 def _remove( self, url: str ) -> None: 

319 ''' Removes entry from cache and updates memory tracking. ''' 

320 if entry := self._cache.pop( url, None ): 

321 self._memory_total -= entry.memory_usage 

322 with __.ctxl.suppress( ValueError ): 

323 self._recency.remove( url ) 

324 

325 
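A memory-accounting sketch with deliberately tiny, hypothetical numbers: each entry charges its payload length plus the 100-byte overhead estimate, so the second store below exceeds the budget and evicts the least-recently-used URL.

import asyncio
import appcore.generics as _generics
import httpx

async def demo( ) -> None:
    cache = ContentCache( memory_max = 1024 )
    payload = _generics.Value( b'x' * 600 )  # charged as 600 + 100 bytes
    await cache.store( 'https://example.com/a', payload, httpx.Headers( ), 300.0 )
    await cache.store( 'https://example.com/b', payload, httpx.Headers( ), 300.0 )
    # 1400 bytes would exceed memory_max, so _evict_by_memory drops 'a';
    # access now returns the absent sentinel for it.
    print( await cache.access( 'https://example.com/a' ) )

asyncio.run( demo( ) )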

326class ProbeCache( Cache ): 

327 ''' Cache manager for URL probe results (HEAD requests). ''' 

328 

329 entries_max: int = 1000 

330 

331 def __init__( 

332 self, *, 

333 robots_cache: __.Absential[ RobotsCache ] = __.absent, 

334 entries_max: __.Absential[ int ] = __.absent, 

335 **base_initargs: __.typx.Any 

336 ) -> None: 

337 super( ).__init__( **base_initargs ) 

338 if __.is_absent( robots_cache ): 

339 self.robots_cache = RobotsCache( **base_initargs ) 

340 else: self.robots_cache = robots_cache 

341 if not __.is_absent( entries_max ): self.entries_max = entries_max 

342 self._cache: dict[ str, ProbeCacheEntry ] = { } 

343 self._recency: __.collections.deque[ str ] = __.collections.deque( ) 

344 

345 @classmethod 

346 def from_configuration( 

347 cls, 

348 configuration: __.cabc.Mapping[ str, __.typx.Any ], 

349 robots_cache: __.Absential[ RobotsCache ] = __.absent 

350 ) -> __.typx.Self: 

351 ''' Creates ProbeCache instance from application configuration. ''' 

352 cache_config = configuration.get( 'cache', { } ) 

353 probe_ttl = cache_config.get( 'probe-ttl', 300.0 ) 

354 nomargs = { 'success_ttl': probe_ttl } 

355 if not __.is_absent( robots_cache ): 

356 nomargs[ 'robots_cache' ] = robots_cache 

357 return cls( **nomargs ) 

358 

359 async def access( self, url: str ) -> __.Absential[ bool ]: 

360 ''' Retrieves cached probe result if valid. ''' 

361 if url not in self._cache: return __.absent 

362 entry = self._cache[ url ] 

363 if entry.invalid: 

364 self._remove( url ) 

365 return __.absent 

366 self._record_access( url ) 

367 return entry.response.extract( ) 

368 

369 def determine_ttl( self, response: ProbeResponse ) -> float: 

370 ''' Determines appropriate TTL based on response type. ''' 

371 if response.is_value( ): 

372 return self.success_ttl 

373 # TODO: Inspect exception type for more granular TTL 

374 return self.error_ttl 

375 

376 async def probe_url( 

377 self, 

378 url: _Url, /, *, 

379 duration_max: float = 10.0, 

380 client_factory: HttpClientFactory = _httpx.AsyncClient, 

381 ) -> bool: 

382 ''' Convenience method for probing URL existence. ''' 

383 return await probe_url( 

384 self, url, 

385 duration_max = duration_max, 

386 client_factory = client_factory ) 

387 

388 async def store( 

389 self, url: str, response: ProbeResponse, ttl: float 

390 ) -> None: 

391 ''' Stores probe result in cache. ''' 

392 entry = ProbeCacheEntry( 

393 response = response, 

394 timestamp = __.time.time( ), 

395 ttl = ttl ) 

396 self._cache[ url ] = entry 

397 self._record_access( url ) 

398 self._evict_by_count( ) 

399 

400 def _evict_by_count( self ) -> None: 

401 ''' Evicts oldest entries when cache exceeds max size. ''' 

402 while ( 

403 len( self._cache ) > self.entries_max 

404 and self._recency 

405 ): 

406 lru_url = self._recency.popleft( ) 

407 if lru_url in self._cache: # pragma: no branch 

408 del self._cache[ lru_url ] 

409 

410 def _record_access( self, url: str ) -> None: 

411 ''' Updates LRU access order for given URL. ''' 

412 with __.ctxl.suppress( ValueError ): 

413 self._recency.remove( url ) 

414 self._recency.append( url ) 

415 

416 def _remove( self, url: str ) -> None: 

417 ''' Removes entry from cache. ''' 

418 self._cache.pop( url, None ) 

419 with __.ctxl.suppress( ValueError ): 

420 self._recency.remove( url ) 

421 

422 
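The TTL split above is easy to verify directly; a small check, assuming these module-level names are importable:

import appcore.generics as _generics

cache = ProbeCache( error_ttl = 15.0, success_ttl = 600.0 )
assert cache.determine_ttl( _generics.Value( True ) ) == 600.0
assert cache.determine_ttl( _generics.Error( TimeoutError( ) ) ) == 15.0
# Failed probes expire quickly, so transient outages get retried sooner.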

423_http_success_threshold = 400 

424 

425 

426_scribe = __.acquire_scribe( __name__ ) 

427 

428 

429def prepare( 

430 auxdata: __.Globals 

431) -> tuple[ ContentCache, ProbeCache, RobotsCache ]: 

432 ''' Prepares cache instances from configuration. 

433 

434 Returns content, probe, and robots caches; the content and probe caches share the robots cache. 

435 ''' 

436 configuration = auxdata.configuration 

437 robots_cache = RobotsCache.from_configuration( configuration ) 

438 return ( 

439 ContentCache.from_configuration( configuration, robots_cache ), 

440 ProbeCache.from_configuration( configuration, robots_cache ), 

441 robots_cache, 

442 ) 

443 

444 
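The from_configuration constructors above consume a small nested mapping; a hypothetical example of the expected shape (key names taken from the code, values illustrative):

configuration = {
    'cache': {
        'robots-ttl': 7200.0,      # RobotsCache ttl
        'content-ttl': 600.0,      # ContentCache success_ttl
        'memory-limit': 16777216,  # ContentCache memory_max, in bytes
        'probe-ttl': 120.0,        # ProbeCache success_ttl
    },
}
# prepare( ) reads this from auxdata.configuration and shares one RobotsCache
# across the content and probe caches, so crawl delays govern both.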

445async def probe_url( 

446 cache: ProbeCache, 

447 url: _Url, *, 

448 duration_max: float = 10.0, 

449 client_factory: HttpClientFactory = _httpx.AsyncClient, 

450) -> bool: 

451 ''' Cached HEAD request to check URL existence. ''' 

452 url_s = url.geturl( ) 

453 match url.scheme: 

454 case '' | 'file': 

455 return __.Path( url.path ).exists( ) 

456 case 'http' | 'https': 

457 result = await cache.access( url_s ) 

458 if not __.is_absent( result ): return result 

459 async with client_factory( ) as client: 

460 result = await _probe_url( 

461 url, duration_max = duration_max, 

462 client = client, 

463 probe_cache = cache, 

464 robots_cache = cache.robots_cache ) 

465 ttl = cache.determine_ttl( result ) 

466 await cache.store( url_s, result, ttl ) 

467 return result.extract( ) 

468 case _: return False 

469 

470 
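A usage sketch, assuming probe_url and ProbeCache are importable from this module: filesystem paths are answered without network traffic, while http(s) URLs go through the cache and a HEAD request.

import asyncio
from urllib.parse import urlparse

async def main( ) -> None:
    cache = ProbeCache( )
    exists = await probe_url( cache, urlparse( 'https://example.com/page' ) )
    print( exists )  # True for any response status below 400

asyncio.run( main( ) )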

471async def retrieve_url( 

472 cache: ContentCache, 

473 url: _Url, *, 

474 duration_max: float = 30.0, 

475 client_factory: HttpClientFactory = _httpx.AsyncClient, 

476) -> bytes: 

477 ''' Cached GET request to fetch URL content as bytes. ''' 

478 url_s = url.geturl( ) 

479 match url.scheme: 

480 case '' | 'file': 

481 location = __.Path( url.path ) 

482 try: return location.read_bytes( ) 

483 except Exception as exc: 

484 raise _exceptions.DocumentationInaccessibility( 

485 url_s, exc ) from exc 

486 case 'http' | 'https': 

487 result = await cache.access( url_s ) 

488 if not __.is_absent( result ): 

489 content_bytes, _ = result 

490 return content_bytes 

491 async with client_factory( ) as client: 

492 result, headers = await _retrieve_url( 

493 url, 

494 duration_max = duration_max, 

495 client = client, 

496 content_cache = cache, 

497 robots_cache = cache.robots_cache ) 

498 ttl = cache.determine_ttl( result ) 

499 await cache.store( url_s, result, headers, ttl ) 

500 return result.extract( ) 

501 case _: 

502 raise _exceptions.DocumentationInaccessibility( 

503 url_s, f"Unsupported scheme: {url.scheme}" ) 

504 

505 
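A sketch of the '' / 'file' branch, which bypasses the cache and reads straight from disk:

import asyncio, tempfile
from pathlib import Path
from urllib.parse import urlparse

async def main( ) -> None:
    with tempfile.NamedTemporaryFile( delete = False ) as stream:
        stream.write( b'hello' )
    url = urlparse( Path( stream.name ).as_uri( ) )
    print( await retrieve_url( ContentCache( ), url ) )  # b'hello'

asyncio.run( main( ) )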

506async def retrieve_url_as_text( 

507 cache: ContentCache, 

508 url: _Url, *, 

509 duration_max: float = 30.0, 

510 charset_default: str = 'utf-8', 

511 client_factory: HttpClientFactory = _httpx.AsyncClient, 

512) -> str: 

513 ''' Cached GET request to fetch URL content as text. ''' 

514 url_s = url.geturl( ) 

515 match url.scheme: 

516 case '' | 'file': 

517 location = __.Path( url.path ) 

518 try: content_bytes = location.read_bytes( ) 

519 except Exception as exc: 

520 raise _exceptions.DocumentationInaccessibility( 

521 url_s, exc ) from exc 

522 mimetype, charset = __.detext.detect_mimetype_and_charset( 

523 content_bytes, location ) 

524 if not __.detext.is_textual_content( content_bytes ): 

525 raise _exceptions.DocumentationInaccessibility( 

526 url_s, "Content analysis indicates non-textual data" ) 

527 encoding = charset or charset_default 

528 return content_bytes.decode( encoding ) 

529 case 'http' | 'https': 

530 result = await cache.access( url_s ) 

531 if not __.is_absent( result ): 

532 content_bytes, headers = result 

533 _validate_textual_content( 

534 content_bytes, headers, url_s ) 

535 charset = _detect_charset_with_fallback( 

536 content_bytes, headers, charset_default ) 

537 return content_bytes.decode( charset ) 

538 async with client_factory( ) as client: 

539 result, headers = await _retrieve_url( 

540 url, duration_max = duration_max, 

541 client = client, 

542 content_cache = cache, 

543 robots_cache = cache.robots_cache ) 

544 ttl = cache.determine_ttl( result ) 

545 await cache.store( url_s, result, headers, ttl ) 

546 content_bytes = result.extract( ) 

547 _validate_textual_content( 

548 content_bytes, headers, url_s ) 

549 charset = _detect_charset_with_fallback( 

550 content_bytes, headers, charset_default ) 

551 return content_bytes.decode( charset ) 

552 case _: 

553 raise _exceptions.DocumentationInaccessibility( 

554 url_s, f"Unsupported scheme: {url.scheme}" ) 

555 

556 
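For http(s) content the decode order is: charset from the Content-Type header, then content sniffing, then charset_default. A hedged sketch of overriding that final fallback, assuming the names are importable:

import asyncio
from urllib.parse import urlparse

async def main( ) -> None:
    cache = ContentCache( )
    text = await retrieve_url_as_text(
        cache, urlparse( 'https://example.com/readme' ),
        charset_default = 'latin-1' )  # used only if headers and sniffing both fail
    print( text[ : 80 ] )

asyncio.run( main( ) )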

557async def _apply_request_delay( 

558 url: _Url, 

559 client: _httpx.AsyncClient, 

560 cache: RobotsCache, 

561) -> None: 

562 ''' Applies crawl delay to request if specified in robots.txt. ''' 

563 if url.scheme not in ( 'http', 'https' ): return 

564 domain = _extract_domain( url ) 

565 delay = cache.calculate_delay_remainder( domain ) 

566 if delay > 0: await cache.delay_function( delay ) 

567 parser = await cache.access( domain ) 

568 if __.is_absent( parser ): 

569 parser = await _retrieve_robots_txt( client, cache, domain ) 

570 if not __.is_absent( parser ): 

571 try: delay = parser.crawl_delay( cache.user_agent ) 

572 except Exception as exc: 

573 _scribe.debug( f"Failed to get crawl delay for {domain}: {exc}" ) 

574 else: 

575 if delay: cache.assign_delay( domain, float( delay ) ) 

576 

577 

578async def _cache_robots_txt_error( 

579 domain: str, cache: RobotsCache, error: Exception 

580) -> __.Absential[ _RobotFileParser ]: 

581 _scribe.debug( f"Failed to fetch/parse robots.txt from {domain}: {error}" ) 

582 result: RobotsResponse = _generics.Error( error ) 

583 return await _cache_robots_txt_result( cache, domain, result ) 

584 

585 

586async def _cache_robots_txt_result( 

587 cache: RobotsCache, domain: str, result: RobotsResponse 

588) -> __.Absential[ _RobotFileParser ]: 

589 ttl = cache.determine_ttl( result ) 

590 await cache.store( domain, result, ttl ) 

591 return result.extract( ) if result.is_value( ) else __.absent 

592 

593 

594async def _check_robots_txt( 

595 url: _Url, *, 

596 client: _httpx.AsyncClient, 

597 cache: RobotsCache, 

598) -> bool: 

599 ''' Checks if URL is allowed by robots.txt. ''' 

600 if url.scheme not in ( 'http', 'https' ): return True 

601 url_s = url.geturl( ) 

602 domain = _extract_domain( url ) 

603 parser = await cache.access( domain ) 

604 if __.is_absent( parser ): 

605 parser = await _retrieve_robots_txt( client, cache, domain ) 

606 if __.is_absent( parser ): return True 

607 try: return parser.can_fetch( cache.user_agent, url_s ) 

608 except Exception as exc: 

609 _scribe.debug( f"robots.txt check failed for {url_s}: {exc}" ) 

610 return True # on parser error, assume URL allowed 

611 

612 
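For reference, the stdlib parser behavior this check leans on, runnable with urllib.robotparser alone:

from urllib.robotparser import RobotFileParser

parser = RobotFileParser( )
parser.parse( [
    'User-agent: *',
    'Disallow: /private/',
    'Crawl-delay: 2',
] )
assert parser.can_fetch( '*', 'https://example.com/docs/page.html' )
assert not parser.can_fetch( '*', 'https://example.com/private/key' )
assert parser.crawl_delay( '*' ) == 2  # consumed by _apply_request_delay above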

613def _detect_charset_with_fallback( 

614 content: bytes, headers: _httpx.Headers, default: str 

615) -> str: 

616 ''' Detects charset from headers with content-based fallback. ''' 

617 header_charset = _extract_charset_from_headers( headers, '' ) 

618 if header_charset: 

619 return header_charset 

620 detected_charset = __.detext.detect_charset( content ) 

621 return detected_charset or default 

622 

623 

624def _detect_mimetype_with_fallback( 

625 content: bytes, headers: _httpx.Headers, url: str 

626) -> str: 

627 ''' Detects MIME type from headers with content-based fallback. ''' 

628 header_mimetype = _extract_mimetype_from_headers( headers ) 

629 if header_mimetype: 

630 return header_mimetype 

631 return __.detext.detect_mimetype( content, url ) or '' 

632 

633 

634def _extract_charset_from_headers( 

635 headers: _httpx.Headers, default: str 

636) -> str: 

637 ''' Extracts charset from Content-Type header. ''' 

638 content_type = headers.get( 'content-type', '' ) 

639 if isinstance( content_type, str ) and ';' in content_type: 

640 _, _, params = content_type.partition( ';' ) 

641 if 'charset=' in params: 

642 charset = params.split( 'charset=' )[ -1 ].strip( ) 

643 return charset.strip( '"\'' ) 

644 return default 

645 

646 

647def _extract_domain( url: _Url ) -> str: 

648 ''' Extracts domain from URL for robots.txt caching. ''' 

649 return f"{url.scheme}://{url.netloc}" 

650 

651 

652def _extract_mimetype_from_headers( headers: _httpx.Headers ) -> str: 

653 ''' Extracts mimetype from Content-Type header. ''' 

654 content_type = headers.get( 'content-type', '' ) 

655 if isinstance( content_type, str ) and ';' in content_type: 

656 mimetype, _, _ = content_type.partition( ';' ) 

657 return mimetype.strip( ) 

658 return content_type 

659 

660 
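A quick check of both header helpers, assuming the private names are reachable (e.g. from the module's own tests):

import httpx

headers = httpx.Headers( { 'content-type': 'text/html; charset="iso-8859-1"' } )
assert _extract_mimetype_from_headers( headers ) == 'text/html'
assert _extract_charset_from_headers( headers, 'utf-8' ) == 'iso-8859-1'
# Without a parameter section, the charset helper returns the supplied default.
assert _extract_charset_from_headers( httpx.Headers( ), 'utf-8' ) == 'utf-8'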

661async def _probe_url( 

662 url: _Url, /, *, 

663 duration_max: float, 

664 client: _httpx.AsyncClient, 

665 probe_cache: ProbeCache, 

666 robots_cache: RobotsCache, 

667) -> ProbeResponse: 

668 ''' Makes HEAD request with deduplication. ''' 

669 url_s = url.geturl( ) 

670 if not await _check_robots_txt( 

671 url, client = client, cache = robots_cache 

672 ): 

673 _scribe.debug( f"URL blocked by robots.txt: {url_s}" ) 

674 return _generics.Error( _exceptions.UrlImpermissibility( 

675 url_s, robots_cache.user_agent ) ) 

676 await _apply_request_delay( url, cache = robots_cache, client = client ) 

677 async with probe_cache.acquire_mutex_for( url_s ): 

678 try: 

679 response = await client.head( 

680 url_s, timeout = duration_max, follow_redirects = True ) 

681 except Exception as exc: 

682 _scribe.debug( f"HEAD request failed for {url_s}: {exc}" ) 

683 return _generics.Error( exc ) 

684 else: 

685 return _generics.Value( 

686 response.status_code < _http_success_threshold ) 

687 

688 

689async def _retrieve_robots_txt( 

690 client: _httpx.AsyncClient, cache: RobotsCache, domain: str 

691) -> __.Absential[ _RobotFileParser ]: 

692 ''' Fetches and parses robots.txt for domain. ''' 

693 robots_url = f"{domain}/robots.txt" 

694 async with cache.acquire_mutex_for( domain ): 

695 timeout = cache.request_timeout 

696 try: 

697 response = await client.get( 

698 robots_url, timeout = timeout, follow_redirects = True ) 

699 except Exception as exc: 

700 return await _cache_robots_txt_error( domain, cache, exc ) 

701 match response.status_code: 

702 case _HttpStatus.OK: lines = response.text.splitlines( ) 

703 case _HttpStatus.NOT_FOUND: lines = [ ] 

704 case _: 

705 try: response.raise_for_status( ) 

706 except Exception as exc: 

707 return await _cache_robots_txt_error( domain, cache, exc ) 

708 robots_parser = _RobotFileParser( ) 

709 robots_parser.set_url( robots_url ) 

710 try: robots_parser.parse( lines ) 

711 except Exception as exc: 

712 return await _cache_robots_txt_error( domain, cache, exc ) 

713 result: RobotsResponse = _generics.Value( robots_parser ) 

714 return await _cache_robots_txt_result( cache, domain, result ) 

715 

716 

717async def _retrieve_url( 

718 url: _Url, /, *, 

719 duration_max: float, 

720 client: _httpx.AsyncClient, 

721 content_cache: ContentCache, 

722 robots_cache: RobotsCache, 

723) -> tuple[ ContentResponse, _httpx.Headers ]: 

724 ''' Makes GET request with deduplication. ''' 

725 url_s = url.geturl( ) 

726 if not await _check_robots_txt( 

727 url, cache = robots_cache, client = client 

728 ): 

729 return ( 

730 _generics.Error( _exceptions.UrlImpermissibility( 

731 url_s, robots_cache.user_agent ) ), 

732 _httpx.Headers( ) ) 

733 await _apply_request_delay( url, cache = robots_cache, client = client ) 

734 async with content_cache.acquire_mutex_for( url_s ): 

735 try: 

736 response = await client.get( 

737 url_s, timeout = duration_max, follow_redirects = True ) 

738 response.raise_for_status( ) 

739 except Exception as exc: 

740 _scribe.debug( f"GET request failed for {url_s}: {exc}" ) 

741 return _generics.Error( exc ), _httpx.Headers( ) 

742 else: return _generics.Value( response.content ), response.headers 

743 

744 

745def _validate_textual_content( 

746 content: bytes, headers: _httpx.Headers, url: str 

747) -> None: 

748 ''' Validates that content is textual via headers and content analysis. ''' 

749 mimetype = _detect_mimetype_with_fallback( content, headers, url ) 

750 if mimetype and not __.detext.is_textual_mimetype( mimetype ): 

751 raise _exceptions.HttpContentTypeInvalidity( 

752 url, mimetype, "text decoding" ) 

753 if not __.detext.is_textual_content( content ): 

754 raise _exceptions.HttpContentTypeInvalidity( 

755 url, mimetype or 'unknown', "content analysis" )