Coverage for sources/librovore/cacheproxy.py: 87% (404 statements)


# vim: set filetype=python fileencoding=utf-8:
# -*- coding: utf-8 -*-

#============================================================================#
#                                                                            #
#  Licensed under the Apache License, Version 2.0 (the "License");           #
#  you may not use this file except in compliance with the License.          #
#  You may obtain a copy of the License at                                   #
#                                                                            #
#      http://www.apache.org/licenses/LICENSE-2.0                            #
#                                                                            #
#  Unless required by applicable law or agreed to in writing, software       #
#  distributed under the License is distributed on an "AS IS" BASIS,         #
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  #
#  See the License for the specific language governing permissions and      #
#  limitations under the License.                                            #
#                                                                            #
#============================================================================#


''' HTTP cache for documentation URL access. '''


from http import HTTPStatus as _HttpStatus
from urllib.parse import ParseResult as _Url
from urllib.robotparser import RobotFileParser as _RobotFileParser

import appcore.generics as _generics
import httpx as _httpx

from . import __
from . import exceptions as _exceptions


HttpClientFactory: __.typx.TypeAlias = (
    __.cabc.Callable[ [ ], _httpx.AsyncClient ] )
ContentResponse: __.typx.TypeAlias = _generics.Result[ bytes, Exception ]
ProbeResponse: __.typx.TypeAlias = _generics.Result[ bool, Exception ]
RobotsResponse: __.typx.TypeAlias = (
    _generics.Result[ _RobotFileParser, Exception ] )
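
# Illustrative sketch (not part of the original module): the response
# aliases wrap payloads in `appcore.generics.Result`, which this module
# consumes only through `is_value( )` and `extract( )`. Assuming those
# semantics, a cached success and a cached failure would look like:
#
#     success: ContentResponse = _generics.Value( b'<html/>' )
#     if success.is_value( ): payload = success.extract( )
#     failure: ContentResponse = _generics.Error( TimeoutError( ) )
#     failure.extract( )  # presumably re-raises the stored exception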


class CacheEntry( __.immut.DataclassObject ):
    ''' Cache entry base. '''

    timestamp: float
    ttl: float

    @property
    def invalid( self ) -> bool:
        ''' Checks if cache entry has exceeded its TTL. '''
        return __.time.time( ) - self.timestamp > self.ttl


class ContentCacheEntry( CacheEntry ):
    ''' Cache entry for URL content with size tracking. '''

    response: ContentResponse
    headers: _httpx.Headers
    size_bytes: int

    @property
    def memory_usage( self ) -> int:
        ''' Calculates total memory usage including metadata. '''
        return self.size_bytes + 100  # Overhead estimate


class ProbeCacheEntry( CacheEntry ):
    ''' Cache entry for URL probe results. '''

    response: ProbeResponse


class RobotsCacheEntry( CacheEntry ):
    ''' Cache entry for robots.txt parser. '''

    response: RobotsResponse


class Cache( __.immut.Object ):
    ''' Cache base with shared configuration attributes. '''

    error_ttl: float = 30.0
    success_ttl: float = 300.0

    def __init__(
        self, *,
        error_ttl: __.Absential[ float ] = __.absent,
        success_ttl: __.Absential[ float ] = __.absent,
        delay_function: __.cabc.Callable[
            [ float ], __.cabc.Awaitable[ None ]
        ] = __.asyncio.sleep
    ) -> None:
        if not __.is_absent( error_ttl ): self.error_ttl = error_ttl
        if not __.is_absent( success_ttl ): self.success_ttl = success_ttl
        self.delay_function = delay_function
        self._request_mutexes: dict[ str, __.asyncio.Lock ] = { }

    @__.ctxl.asynccontextmanager
    async def acquire_mutex_for( self, url: str ):
        ''' Acquires mutex for HTTP request deduplication. '''
        if url not in self._request_mutexes:  # pragma: no branch
            self._request_mutexes[ url ] = __.asyncio.Lock( )
        mutex = self._request_mutexes[ url ]
        async with mutex:
            try: yield
            finally: self._request_mutexes.pop( url, None )
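
# Deduplication sketch (hypothetical caller): concurrent fetches of the
# same URL serialize on the shared per-URL lock, so later callers find
# the first caller's result already cached instead of re-requesting:
#
#     async with cache.acquire_mutex_for( url_s ):
#         ...  # at most one in-flight request for url_s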


class RobotsCache( Cache ):
    ''' Cache manager for robots.txt files with crawl delay tracking. '''

    entries_max: int = 500
    request_timeout: float = 5.0
    ttl: float = 3600.0
    user_agent: str = '*'

    def __init__(
        self, *,
        entries_max: __.Absential[ int ] = __.absent,
        ttl: __.Absential[ float ] = __.absent,
        request_timeout: __.Absential[ float ] = __.absent,
        user_agent: __.Absential[ str ] = __.absent,
        **base_initargs: __.typx.Any
    ) -> None:
        super( ).__init__( **base_initargs )
        if not __.is_absent( entries_max ): self.entries_max = entries_max
        if not __.is_absent( ttl ): self.ttl = ttl
        if not __.is_absent( request_timeout ):
            self.request_timeout = request_timeout
        if not __.is_absent( user_agent ): self.user_agent = user_agent
        self._cache: dict[ str, RobotsCacheEntry ] = { }
        self._recency: __.collections.deque[ str ] = __.collections.deque( )
        self._request_delays: dict[ str, float ] = { }

    @classmethod
    def from_configuration(
        cls, configuration: __.cabc.Mapping[ str, __.typx.Any ]
    ) -> __.typx.Self:
        ''' Creates RobotsCache instance from application configuration. '''
        cache_config = configuration.get( 'cache', { } )
        robots_ttl = cache_config.get( 'robots-ttl', 3600.0 )
        return cls( ttl = robots_ttl )

    async def access(
        self, client: _httpx.AsyncClient, domain: str, # TODO: retriever
    ) -> _RobotFileParser:
        ''' Retrieves cached robots.txt parser if valid. '''
        if domain not in self._cache:
            await _retrieve_robots_txt( client, self, domain )
        entry = self._cache[ domain ]
        if entry.invalid:
            self._remove( domain )
            await _retrieve_robots_txt( client, self, domain )
            entry = self._cache[ domain ]
        self._record_access( domain )
        return entry.response.extract( )

    def assign_delay( self, domain: str, delay_seconds: float ) -> None:
        ''' Sets next allowed request time for domain. '''
        self._request_delays[ domain ] = __.time.time( ) + delay_seconds

    def calculate_delay_remainder( self, domain: str ) -> float:
        ''' Returns remaining crawl delay time for domain. '''
        allow_at = self._request_delays.get( domain, 0.0 )
        if not allow_at: return 0.0
        remainder = allow_at - __.time.time( )
        return max( 0.0, remainder )

    def determine_ttl( self, response: RobotsResponse ) -> float:
        ''' Determines appropriate TTL based on response type. '''
        if response.is_value( ): return self.ttl
        return self.error_ttl

    async def store(
        self, domain: str, response: RobotsResponse, ttl: float
    ) -> None:
        ''' Stores robots.txt parser in cache. '''
        entry = RobotsCacheEntry(
            response = response, timestamp = __.time.time( ), ttl = ttl )
        self._cache[ domain ] = entry
        self._record_access( domain )
        self._evict_by_count( )

    def _evict_by_count( self ) -> None:
        ''' Evicts oldest entries when cache exceeds max size. '''
        while (
            len( self._cache ) > self.entries_max
            and self._recency
        ):
            lru_domain = self._recency.popleft( )
            if lru_domain in self._cache:  # pragma: no branch
                del self._cache[ lru_domain ]

    def _record_access( self, domain: str ) -> None:
        ''' Updates LRU access order for given domain. '''
        with __.ctxl.suppress( ValueError ):
            self._recency.remove( domain )
        self._recency.append( domain )

    def _remove( self, domain: str ) -> None:
        ''' Removes entry from cache. '''
        self._cache.pop( domain, None )
        with __.ctxl.suppress( ValueError ):
            self._recency.remove( domain )


class ContentCache( Cache, instances_mutables = ( '_memory_total', ) ):
    ''' Cache manager for URL content (GET requests) with memory tracking. '''

    memory_max: int = 32 * 1024 * 1024

    def __init__(
        self, *,
        robots_cache: __.Absential[ RobotsCache ] = __.absent,
        memory_max: __.Absential[ int ] = __.absent,
        **base_initargs: __.typx.Any
    ) -> None:
        super( ).__init__( **base_initargs )
        if __.is_absent( robots_cache ):
            self.robots_cache = RobotsCache( **base_initargs )
        else: self.robots_cache = robots_cache
        if not __.is_absent( memory_max ): self.memory_max = memory_max
        self._cache: dict[ str, ContentCacheEntry ] = { }
        self._memory_total = 0
        self._recency: __.collections.deque[ str ] = __.collections.deque( )

    @classmethod
    def from_configuration(
        cls,
        configuration: __.cabc.Mapping[ str, __.typx.Any ],
        robots_cache: __.Absential[ RobotsCache ] = __.absent
    ) -> __.typx.Self:
        ''' Creates ContentCache instance from application configuration. '''
        cache_config = configuration.get( 'cache', { } )
        content_ttl = cache_config.get( 'content-ttl', 300.0 )
        memory_limit = cache_config.get( 'memory-limit', 33554432 )
        nomargs = {
            'success_ttl': content_ttl,
            'memory_max': memory_limit,
        }
        if not __.is_absent( robots_cache ):
            nomargs[ 'robots_cache' ] = robots_cache
        return cls( **nomargs )
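
    # Configuration sketch (keys and defaults taken from the accessors
    # above; the mapping itself is hypothetical and would normally come
    # from application configuration):
    #
    #     configuration = {
    #         'cache': {
    #             'content-ttl': 300.0,      # ContentCache success TTL
    #             'memory-limit': 33554432,  # ContentCache memory_max (32 MiB)
    #             'probe-ttl': 300.0,        # ProbeCache success TTL
    #             'robots-ttl': 3600.0,      # RobotsCache ttl
    #         } }
    #     cache = ContentCache.from_configuration( configuration )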

    async def access(
        self, url: str
    ) -> __.Absential[ tuple[ bytes, _httpx.Headers ] ]:
        ''' Retrieves cached content if valid. '''
        if url not in self._cache: return __.absent
        entry = self._cache[ url ]
        if entry.invalid:
            self._remove( url )
            return __.absent
        self._record_access( url )
        return ( entry.response.extract( ), entry.headers )

    def determine_ttl( self, response: ContentResponse ) -> float:
        ''' Determines appropriate TTL based on response type. '''
        if response.is_value( ):
            return self.success_ttl
        # TODO: Inspect exception type for more granular TTL
        return self.error_ttl

    async def retrieve_url(
        self,
        url: _Url, /, *,
        duration_max: float = 30.0,
        client_factory: HttpClientFactory = _httpx.AsyncClient,
    ) -> bytes:
        ''' Convenience method for retrieving URL content. '''
        return await retrieve_url(
            self, url,
            duration_max = duration_max,
            client_factory = client_factory )

    async def store(
        self, url: str, response: ContentResponse,
        headers: _httpx.Headers, ttl: float
    ) -> None:
        ''' Stores content in cache with memory management. '''
        size_bytes = self._calculate_response_size( response )
        entry = ContentCacheEntry(
            response = response,
            headers = headers,
            timestamp = __.time.time( ),
            ttl = ttl,
            size_bytes = size_bytes )
        if old_entry := self._cache.get( url ):
            self._memory_total -= old_entry.memory_usage
        self._cache[ url ] = entry
        self._memory_total += entry.memory_usage
        self._record_access( url )
        self._evict_by_memory( )

    def _calculate_response_size( self, response: ContentResponse ) -> int:
        ''' Calculates memory footprint of cached response. '''
        if response.is_value( ):
            content = response.extract( )
            return len( content )
        return 100  # Conservative estimate for exception overhead

    def _evict_by_memory( self ) -> None:
        ''' Evicts LRU entries until memory usage is under limit. '''
        while (
            self._memory_total > self.memory_max
            and self._recency
        ):
            lru_url = self._recency.popleft( )
            if lru_url in self._cache:  # pragma: no branch
                entry = self._cache[ lru_url ]
                self._memory_total -= entry.memory_usage
                del self._cache[ lru_url ]
                _scribe.debug( f"Evicted cache entry: {lru_url}" )

    def _record_access( self, url: str ) -> None:
        ''' Updates LRU access order for given URL. '''
        with __.ctxl.suppress( ValueError ):
            self._recency.remove( url )
        self._recency.append( url )

    def _remove( self, url: str ) -> None:
        ''' Removes entry from cache and updates memory tracking. '''
        if entry := self._cache.pop( url, None ):
            self._memory_total -= entry.memory_usage
        with __.ctxl.suppress( ValueError ):
            self._recency.remove( url )


class ProbeCache( Cache ):
    ''' Cache manager for URL probe results (HEAD requests). '''

    entries_max: int = 1000

    def __init__(
        self, *,
        robots_cache: __.Absential[ RobotsCache ] = __.absent,
        entries_max: __.Absential[ int ] = __.absent,
        **base_initargs: __.typx.Any
    ) -> None:
        super( ).__init__( **base_initargs )
        if __.is_absent( robots_cache ):
            self.robots_cache = RobotsCache( **base_initargs )
        else: self.robots_cache = robots_cache
        if not __.is_absent( entries_max ): self.entries_max = entries_max
        self._cache: dict[ str, ProbeCacheEntry ] = { }
        self._recency: __.collections.deque[ str ] = __.collections.deque( )

    @classmethod
    def from_configuration(
        cls,
        configuration: __.cabc.Mapping[ str, __.typx.Any ],
        robots_cache: __.Absential[ RobotsCache ] = __.absent
    ) -> __.typx.Self:
        ''' Creates ProbeCache instance from application configuration. '''
        cache_config = configuration.get( 'cache', { } )
        probe_ttl = cache_config.get( 'probe-ttl', 300.0 )
        nomargs = { 'success_ttl': probe_ttl }
        if not __.is_absent( robots_cache ):
            nomargs[ 'robots_cache' ] = robots_cache
        return cls( **nomargs )

    async def access( self, url: str ) -> __.Absential[ bool ]:
        ''' Retrieves cached probe result if valid. '''
        if url not in self._cache: return __.absent
        entry = self._cache[ url ]
        if entry.invalid:
            self._remove( url )
            return __.absent
        self._record_access( url )
        return entry.response.extract( )

    def determine_ttl( self, response: ProbeResponse ) -> float:
        ''' Determines appropriate TTL based on response type. '''
        if response.is_value( ):
            return self.success_ttl
        # TODO: Inspect exception type for more granular TTL
        return self.error_ttl

    async def probe_url(
        self,
        url: _Url, /, *,
        duration_max: float = 10.0,
        client_factory: HttpClientFactory = _httpx.AsyncClient,
    ) -> bool:
        ''' Convenience method for probing URL existence. '''
        return await probe_url(
            self, url,
            duration_max = duration_max,
            client_factory = client_factory )

    async def store(
        self, url: str, response: ProbeResponse, ttl: float
    ) -> None:
        ''' Stores probe result in cache. '''
        entry = ProbeCacheEntry(
            response = response,
            timestamp = __.time.time( ),
            ttl = ttl )
        self._cache[ url ] = entry
        self._record_access( url )
        self._evict_by_count( )

    def _evict_by_count( self ) -> None:
        ''' Evicts oldest entries when cache exceeds max size. '''
        while (
            len( self._cache ) > self.entries_max
            and self._recency
        ):
            lru_url = self._recency.popleft( )
            if lru_url in self._cache:  # pragma: no branch
                del self._cache[ lru_url ]

    def _record_access( self, url: str ) -> None:
        ''' Updates LRU access order for given URL. '''
        with __.ctxl.suppress( ValueError ):
            self._recency.remove( url )
        self._recency.append( url )

    def _remove( self, url: str ) -> None:
        ''' Removes entry from cache. '''
        self._cache.pop( url, None )
        with __.ctxl.suppress( ValueError ):
            self._recency.remove( url )


_http_success_threshold = 400


_scribe = __.acquire_scribe( __name__ )


def prepare(
    auxdata: __.Globals
) -> tuple[ ContentCache, ProbeCache, RobotsCache ]:
    ''' Prepares cache instances from configuration.

        Returns cache instances constructed from application configuration.
    '''
    configuration = auxdata.configuration
    robots_cache = RobotsCache.from_configuration( configuration )
    return (
        ContentCache.from_configuration( configuration, robots_cache ),
        ProbeCache.from_configuration( configuration, robots_cache ),
        robots_cache,
    )
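
# Usage sketch (hypothetical `auxdata` carrying application configuration;
# URLs below are `urllib.parse.ParseResult` instances):
#
#     content_cache, probe_cache, robots_cache = prepare( auxdata )
#     content = await content_cache.retrieve_url( url )
#     exists = await probe_cache.probe_url( url )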


async def probe_url(
    cache: ProbeCache,
    url: _Url, *,
    duration_max: float = 10.0,
    client_factory: HttpClientFactory = _httpx.AsyncClient,
) -> bool:
    ''' Cached HEAD request to check URL existence. '''
    url_s = url.geturl( )
    match url.scheme:
        case '' | 'file':
            return __.Path( url.path ).exists( )
        case 'http' | 'https':
            result = await cache.access( url_s )
            if not __.is_absent( result ): return result
            async with client_factory( ) as client:
                result = await _probe_url(
                    url, duration_max = duration_max,
                    client = client,
                    probe_cache = cache,
                    robots_cache = cache.robots_cache )
                ttl = cache.determine_ttl( result )
                await cache.store( url_s, result, ttl )
                return result.extract( )
        case _: return False
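
# Scheme dispatch sketch: `_Url` is `urllib.parse.ParseResult`, so callers
# construct arguments with `urlparse` (illustrative values):
#
#     from urllib.parse import urlparse
#     exists = await probe_url( cache, urlparse( 'https://example.org/docs' ) )
#     local = await probe_url( cache, urlparse( 'file:///tmp/index.html' ) )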


async def retrieve_url(
    cache: ContentCache,
    url: _Url, *,
    duration_max: float = 30.0,
    client_factory: HttpClientFactory = _httpx.AsyncClient,
) -> bytes:
    ''' Cached GET request to fetch URL content as bytes. '''
    url_s = url.geturl( )
    match url.scheme:
        case '' | 'file':
            location = __.Path( url.path )
            try: return location.read_bytes( )
            except Exception as exc:
                raise _exceptions.DocumentationInaccessibility(
                    url_s, exc ) from exc
        case 'http' | 'https':
            result = await cache.access( url_s )
            if not __.is_absent( result ):
                content_bytes, _ = result
                return content_bytes
            async with client_factory( ) as client:
                result, headers = await _retrieve_url(
                    url,
                    duration_max = duration_max,
                    client = client,
                    content_cache = cache,
                    robots_cache = cache.robots_cache )
                ttl = cache.determine_ttl( result )
                await cache.store( url_s, result, headers, ttl )
                return result.extract( )
        case _:
            raise _exceptions.DocumentationInaccessibility(
                url_s, f"Unsupported scheme: {url.scheme}" )


async def retrieve_url_as_text(
    cache: ContentCache,
    url: _Url, *,
    duration_max: float = 30.0,
    charset_default: str = 'utf-8',
    client_factory: HttpClientFactory = _httpx.AsyncClient,
) -> str:
    ''' Cached GET request to fetch URL content as text. '''
    url_s = url.geturl( )
    match url.scheme:
        case '' | 'file':
            location = __.Path( url.path )
            try: content_bytes = location.read_bytes( )
            except Exception as exc:
                raise _exceptions.DocumentationInaccessibility(
                    url_s, exc ) from exc
            _, charset = __.detext.detect_mimetype_and_charset(
                content_bytes, location )
            if not __.detext.is_textual_content( content_bytes ):
                raise _exceptions.DocumentationInaccessibility(
                    url_s, "Content analysis indicates non-textual data" )
            encoding = charset or charset_default
            return content_bytes.decode( encoding )
        case 'http' | 'https':
            result = await cache.access( url_s )
            if not __.is_absent( result ):
                content_bytes, headers = result
                _validate_textual_content(
                    content_bytes, headers, url_s )
                charset = _detect_charset_with_fallback(
                    content_bytes, headers, charset_default )
                return content_bytes.decode( charset )
            async with client_factory( ) as client:
                result, headers = await _retrieve_url(
                    url, duration_max = duration_max,
                    client = client,
                    content_cache = cache,
                    robots_cache = cache.robots_cache )
                ttl = cache.determine_ttl( result )
                await cache.store( url_s, result, headers, ttl )
                content_bytes = result.extract( )
                _validate_textual_content(
                    content_bytes, headers, url_s )
                charset = _detect_charset_with_fallback(
                    content_bytes, headers, charset_default )
                return content_bytes.decode( charset )
        case _:
            raise _exceptions.DocumentationInaccessibility(
                url_s, f"Unsupported scheme: {url.scheme}" )


async def _apply_request_delay(
    url: _Url,
    client: _httpx.AsyncClient,
    cache: RobotsCache,
) -> None:
    ''' Applies crawl delay to request if specified in robots.txt. '''
    if url.scheme not in ( 'http', 'https' ): return
    domain = _extract_domain( url )
    delay = cache.calculate_delay_remainder( domain )
    if delay > 0: await cache.delay_function( delay )
    try: parser = await cache.access( client, domain )
    except _exceptions.RobotsTxtAccessFailure as exc:
        _scribe.debug(
            f"robots.txt access failed for {domain}: {exc.cause}. "
            f"Skipping crawl delay application." )
        return  # Skip crawl delay when robots.txt unavailable
    try: delay = parser.crawl_delay( cache.user_agent )
    except Exception as exc:
        _scribe.debug( f"Failed to get crawl delay for {domain}: {exc}" )
    else:
        if delay: cache.assign_delay( domain, float( delay ) )


async def _cache_robots_txt_error(
    domain: str, cache: RobotsCache, error: Exception
) -> __.Absential[ _RobotFileParser ]:
    _scribe.debug( f"Failed to fetch/parse robots.txt from {domain}: {error}" )
    if isinstance( error, _exceptions.RobotsTxtAccessFailure ):
        result: RobotsResponse = _generics.Error( error )
    else:
        access_failure = _exceptions.RobotsTxtAccessFailure( domain, error )
        result = _generics.Error( access_failure )
    return await _cache_robots_txt_result( cache, domain, result )


async def _cache_robots_txt_result(
    cache: RobotsCache, domain: str, result: RobotsResponse
) -> __.Absential[ _RobotFileParser ]:
    ttl = cache.determine_ttl( result )
    await cache.store( domain, result, ttl )
    return result.extract( ) if result.is_value( ) else __.absent

602 

603async def _check_robots_txt( 

604 url: _Url, *, 

605 client: _httpx.AsyncClient, 

606 cache: RobotsCache, 

607) -> bool: 

608 ''' Checks if URL is allowed by robots.txt. ''' 

609 if url.scheme not in ( 'http', 'https' ): return True 609 ↛ exitline 609 didn't return from function '_check_robots_txt' because the return on line 609 wasn't executed

610 url_s = url.geturl( ) 

611 domain = _extract_domain( url ) 

612 try: parser = await cache.access( client, domain ) 

613 except _exceptions.RobotsTxtAccessFailure as exc: 

614 _scribe.warning( 

615 f"robots.txt access failed for {domain}: {exc.cause}. " 

616 f"Proceeding without robots.txt validation." ) 

617 return True # Allow access when robots.txt unavailable 

618 try: return parser.can_fetch( cache.user_agent, url_s ) 

619 except Exception as exc: 

620 _scribe.debug( f"robots.txt check failed for {url_s}: {exc}" ) 

621 return True # if no robots.txt, then assume URL allowed 


def _detect_charset_with_fallback(
    content: bytes, headers: _httpx.Headers, default: str
) -> str:
    ''' Detects charset from headers with content-based fallback. '''
    header_charset = _extract_charset_from_headers( headers, '' )
    if header_charset:
        return header_charset
    detected_charset = __.detext.detect_charset( content )
    return detected_charset or default


def _detect_mimetype_with_fallback(
    content: bytes, headers: _httpx.Headers, url: str
) -> str:
    ''' Detects MIME type from headers with content-based fallback. '''
    header_mimetype = _extract_mimetype_from_headers( headers )
    if header_mimetype:
        return header_mimetype
    return __.detext.detect_mimetype( content, url ) or ''


def _extract_charset_from_headers(
    headers: _httpx.Headers, default: str
) -> str:
    ''' Extracts charset from Content-Type header. '''
    content_type = headers.get( 'content-type', '' )
    if isinstance( content_type, str ) and ';' in content_type:
        _, _, params = content_type.partition( ';' )
        if 'charset=' in params:
            charset = params.split( 'charset=' )[ -1 ].strip( )
            return charset.strip( '"\\\'\"' )
    return default
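
# Example (illustrative header value):
#
#     headers = _httpx.Headers( { 'content-type': 'text/html; charset="utf-8"' } )
#     _extract_charset_from_headers( headers, 'ascii' )  # -> 'utf-8'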


def _extract_domain( url: _Url ) -> str:
    ''' Extracts domain from URL for robots.txt caching. '''
    return f"{url.scheme}://{url.netloc}"


def _extract_mimetype_from_headers( headers: _httpx.Headers ) -> str:
    ''' Extracts mimetype from Content-Type header. '''
    content_type = headers.get( 'content-type', '' )
    if isinstance( content_type, str ) and ';' in content_type:
        mimetype, _, _ = content_type.partition( ';' )
        return mimetype.strip( )
    return content_type


async def _probe_url(
    url: _Url, /, *,
    duration_max: float,
    client: _httpx.AsyncClient,
    probe_cache: ProbeCache,
    robots_cache: RobotsCache,
) -> ProbeResponse:
    ''' Makes HEAD request with deduplication. '''
    url_s = url.geturl( )
    if not await _check_robots_txt(
        url, client = client, cache = robots_cache
    ):
        _scribe.debug( f"URL blocked by robots.txt: {url_s}" )
        return _generics.Error( _exceptions.UrlImpermissibility(
            url_s, robots_cache.user_agent ) )
    await _apply_request_delay( url, cache = robots_cache, client = client )
    async with probe_cache.acquire_mutex_for( url_s ):
        try:
            response = await client.head(
                url_s, timeout = duration_max, follow_redirects = True )
        except Exception as exc:
            _scribe.debug( f"HEAD request failed for {url_s}: {exc}" )
            return _generics.Error( exc )
        else:
            return _generics.Value(
                response.status_code < _http_success_threshold )


async def _retrieve_robots_txt(
    client: _httpx.AsyncClient, cache: RobotsCache, domain: str
) -> __.Absential[ _RobotFileParser ]:
    ''' Fetches and parses robots.txt for domain. '''
    robots_url = f"{domain}/robots.txt"
    async with cache.acquire_mutex_for( domain ):
        timeout = cache.request_timeout
        try:
            response = await client.get(
                robots_url, timeout = timeout, follow_redirects = True )
        except Exception as exc:
            return await _cache_robots_txt_error( domain, cache, exc )
        match response.status_code:
            case _HttpStatus.OK: lines = response.text.splitlines( )
            case _HttpStatus.NOT_FOUND: lines = [ ]
            case _:
                try: response.raise_for_status( )
                except Exception as exc:
                    return await _cache_robots_txt_error( domain, cache, exc )
        robots_parser = _RobotFileParser( )
        robots_parser.set_url( robots_url )
        try: robots_parser.parse( lines )
        except Exception as exc:
            return await _cache_robots_txt_error( domain, cache, exc )
        result: RobotsResponse = _generics.Value( robots_parser )
        return await _cache_robots_txt_result( cache, domain, result )


async def _retrieve_url(
    url: _Url, /, *,
    duration_max: float,
    client: _httpx.AsyncClient,
    content_cache: ContentCache,
    robots_cache: RobotsCache,
) -> tuple[ ContentResponse, _httpx.Headers ]:
    ''' Makes GET request with deduplication. '''
    url_s = url.geturl( )
    if not await _check_robots_txt(
        url, cache = robots_cache, client = client
    ):
        return (
            _generics.Error( _exceptions.UrlImpermissibility(
                url_s, robots_cache.user_agent ) ),
            _httpx.Headers( ) )
    await _apply_request_delay( url, cache = robots_cache, client = client )
    async with content_cache.acquire_mutex_for( url_s ):
        try:
            response = await client.get(
                url_s, timeout = duration_max, follow_redirects = True )
            response.raise_for_status( )
        except Exception as exc:
            _scribe.debug( f"GET request failed for {url_s}: {exc}" )
            return _generics.Error( exc ), _httpx.Headers( )
        else: return _generics.Value( response.content ), response.headers


def _validate_textual_content(
    content: bytes, headers: _httpx.Headers, url: str
) -> None:
    ''' Validates that content is textual via headers and content analysis. '''
    mimetype = _detect_mimetype_with_fallback( content, headers, url )
    if mimetype and not __.detext.is_textual_mimetype( mimetype ):
        raise _exceptions.HttpContentTypeInvalidity(
            url, mimetype, "text decoding" )
    if not __.detext.is_textual_content( content ):
        raise _exceptions.HttpContentTypeInvalidity(
            url, mimetype or 'unknown', "content analysis" )