Coverage for sources/librovore/cacheproxy.py: 87%

399 statements  

coverage.py v7.10.6, created at 2025-09-06 02:25 +0000

1# vim: set filetype=python fileencoding=utf-8: 

2# -*- coding: utf-8 -*- 

3 

4#============================================================================# 

5# # 

6# Licensed under the Apache License, Version 2.0 (the "License"); # 

7# you may not use this file except in compliance with the License. # 

8# You may obtain a copy of the License at # 

9# # 

10# http://www.apache.org/licenses/LICENSE-2.0 # 

11# # 

12# Unless required by applicable law or agreed to in writing, software # 

13# distributed under the License is distributed on an "AS IS" BASIS, # 

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 

15# See the License for the specific language governing permissions and # 

16# limitations under the License. # 

17# # 

18#============================================================================# 

19 

20 

21''' HTTP cache for documentation URL access. ''' 

22 

23 

24from http import HTTPStatus as _HttpStatus 

25from urllib.parse import ParseResult as _Url 

26from urllib.robotparser import RobotFileParser as _RobotFileParser 

27 

28import appcore.generics as _generics 

29import httpx as _httpx 

30 

31from . import __ 

32from . import exceptions as _exceptions 

33 

34 

35HttpClientFactory: __.typx.TypeAlias = ( 

36 __.cabc.Callable[ [ ], _httpx.AsyncClient ] ) 

37ContentResponse: __.typx.TypeAlias = _generics.Result[ bytes, Exception ] 

38ProbeResponse: __.typx.TypeAlias = _generics.Result[ bool, Exception ] 

39RobotsResponse: __.typx.TypeAlias = ( 

40 _generics.Result[ _RobotFileParser, Exception ] ) 

41 

42 
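These aliases wrap outcomes in the appcore.generics Result type instead of raising, so errors can be cached alongside successes. A minimal consumption sketch, assuming ContentResponse is importable from this module and using only the Value/Error, is_value, and extract API already used below:

import appcore.generics as _generics

def summarize( response: _generics.Result[ bytes, Exception ] ) -> str:
    # A Value carries fetched bytes; an Error carries the captured exception.
    if response.is_value( ): return f"ok: {len( response.extract( ) )} bytes"
    return 'error (cached under the shorter error_ttl)'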

43class CacheEntry( __.immut.DataclassObject ): 

44 ''' Cache entry base. ''' 

45 

46 timestamp: float 

47 ttl: float 

48 

49 @property 

50 def invalid( self ) -> bool: 

51 ''' Checks if cache entry has exceeded its TTL. ''' 

52 return __.time.time( ) - self.timestamp > self.ttl 

53 

54 

55class ContentCacheEntry( CacheEntry ): 

56 ''' Cache entry for URL content with size tracking. ''' 

57 

58 response: ContentResponse 

59 headers: _httpx.Headers 

60 size_bytes: int 

61 

62 @property 

63 def memory_usage( self ) -> int: 

64 ''' Calculates total memory usage including metadata. ''' 

65 return self.size_bytes + 100 # Overhead estimate 

66 

67 

68class ProbeCacheEntry( CacheEntry ): 

69 ''' Cache entry for URL probe results. ''' 

70 

71 response: ProbeResponse 

72 

73 

74class RobotsCacheEntry( CacheEntry ): 

75 ''' Cache entry for robots.txt parser. ''' 

76 

77 response: RobotsResponse 

78 

79 

80class Cache( __.immut.Object ): 

81 ''' Cache base with shared configuration attributes. ''' 

82 

83 error_ttl: float = 30.0 

84 success_ttl: float = 300.0 

85 

86 def __init__( 

87 self, *, 

88 error_ttl: __.Absential[ float ] = __.absent, 

89 success_ttl: __.Absential[ float ] = __.absent, 

90 delay_function: __.cabc.Callable[ 

91 [ float ], __.cabc.Awaitable[ None ] 

92 ] = __.asyncio.sleep 

93 ) -> None: 

94 if not __.is_absent( error_ttl ): self.error_ttl = error_ttl 

95 if not __.is_absent( success_ttl ): self.success_ttl = success_ttl 

96 self.delay_function = delay_function 

97 self._request_mutexes: dict[ str, __.asyncio.Lock ] = { } 

98 

99 @__.ctxl.asynccontextmanager 

100 async def acquire_mutex_for( self, url: str ): 

101 ''' Acquires mutex for HTTP request deduplication. ''' 

102 if url not in self._request_mutexes: # pragma: no branch 

103 self._request_mutexes[ url ] = __.asyncio.Lock( ) 

104 mutex = self._request_mutexes[ url ] 

105 async with mutex: 

106 try: yield 

107 finally: self._request_mutexes.pop( url, None ) 

108 

109 
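A deduplication sketch, using the concrete ProbeCache subclass and assuming it is importable from this module: concurrent requests for one URL serialize on a per-URL asyncio.Lock, and the mapping entry is dropped again when the holder exits.

import asyncio

async def fetch_once( cache: ProbeCache, url: str ) -> None:
    async with cache.acquire_mutex_for( url ):
        # Only one coroutine at a time proceeds for this URL; followers can
        # consult the now-warm cache instead of repeating the request.
        await asyncio.sleep( 0 )

async def main( ) -> None:
    cache = ProbeCache( )
    await asyncio.gather(
        *( fetch_once( cache, 'https://example.com/x' ) for _ in range( 3 ) ) )

asyncio.run( main( ) )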

110class RobotsCache( Cache ): 

111 ''' Cache manager for robots.txt files with crawl delay tracking. ''' 

112 

113 entries_max: int = 500 

114 request_timeout: float = 5.0 

115 ttl: float = 3600.0 

116 user_agent: str = '*' 

117 

118 def __init__( 

119 self, *, 

120 entries_max: __.Absential[ int ] = __.absent, 

121 ttl: __.Absential[ float ] = __.absent, 

122 request_timeout: __.Absential[ float ] = __.absent, 

123 user_agent: __.Absential[ str ] = __.absent, 

124 **base_initargs: __.typx.Any 

125 ) -> None: 

126 super( ).__init__( **base_initargs ) 

127 if not __.is_absent( entries_max ): self.entries_max = entries_max 

128 if not __.is_absent( ttl ): self.ttl = ttl 

129 if not __.is_absent( request_timeout ): 

130 self.request_timeout = request_timeout 

131 if not __.is_absent( user_agent ): self.user_agent = user_agent 

132 self._cache: dict[ str, RobotsCacheEntry ] = { } 

133 self._recency: __.collections.deque[ str ] = __.collections.deque( ) 

134 self._request_delays: dict[ str, float ] = { } 

135 

136 @classmethod 

137 def from_configuration( 

138 cls, configuration: __.cabc.Mapping[ str, __.typx.Any ] 

139 ) -> __.typx.Self: 

140 ''' Creates RobotsCache instance from application configuration. ''' 

141 cache_config = configuration.get( 'cache', { } ) 

142 robots_ttl = cache_config.get( 'robots-ttl', 3600.0 ) 

143 return cls( ttl = robots_ttl ) 

144 

145 async def access( self, domain: str ) -> __.Absential[ _RobotFileParser ]: 

146 ''' Retrieves cached robots.txt parser if valid. ''' 

147 if domain not in self._cache: return __.absent 

148 entry = self._cache[ domain ] 

149 if entry.invalid: 

150 self._remove( domain ) 

151 return __.absent 

152 self._record_access( domain ) 

153 return entry.response.extract( ) 

154 

155 def assign_delay( self, domain: str, delay_seconds: float ) -> None: 

156 ''' Sets next allowed request time for domain. ''' 

157 self._request_delays[ domain ] = __.time.time( ) + delay_seconds 

158 

159 def calculate_delay_remainder( self, domain: str ) -> float: 

160 ''' Returns remaining crawl delay time for domain. ''' 

161 allow_at = self._request_delays.get( domain, 0.0 ) 

162 if not allow_at: return 0.0 

163 remainder = allow_at - __.time.time( ) 

164 return max( 0.0, remainder ) 

165 

166 def determine_ttl( self, response: RobotsResponse ) -> float: 

167 ''' Determines appropriate TTL based on response type. ''' 

168 if response.is_value( ): return self.ttl 

169 return self.error_ttl 

170 

171 async def store( 

172 self, domain: str, response: RobotsResponse, ttl: float 

173 ) -> None: 

174 ''' Stores robots.txt parser in cache. ''' 

175 entry = RobotsCacheEntry( 

176 response = response, timestamp = __.time.time( ), ttl = ttl ) 

177 self._cache[ domain ] = entry 

178 self._record_access( domain ) 

179 self._evict_by_count( ) 

180 

181 def _evict_by_count( self ) -> None: 

182 ''' Evicts oldest entries when cache exceeds max size. ''' 

183 while ( 

184 len( self._cache ) > self.entries_max 

185 and self._recency 

186 ): 

187 lru_domain = self._recency.popleft( ) 

188 if lru_domain in self._cache: # pragma: no branch 

189 del self._cache[ lru_domain ] 

190 

191 def _record_access( self, domain: str ) -> None: 

192 ''' Updates LRU access order for given domain. ''' 

193 with __.ctxl.suppress( ValueError ): 

194 self._recency.remove( domain ) 

195 self._recency.append( domain ) 

196 

197 def _remove( self, domain: str ) -> None: 

198 ''' Removes entry from cache. ''' 

199 self._cache.pop( domain, None ) 

200 with __.ctxl.suppress( ValueError ): 

201 self._recency.remove( domain ) 

202 

203 
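A crawl-delay bookkeeping sketch, assuming RobotsCache is importable from this module: assign_delay records an absolute next-allowed time per domain, and calculate_delay_remainder clamps the leftover wait at zero.

import asyncio

async def wait_politely( cache: RobotsCache, domain: str ) -> None:
    cache.assign_delay( domain, 2.0 )  # next request allowed ~2 seconds from now
    remainder = cache.calculate_delay_remainder( domain )
    if remainder > 0: await cache.delay_function( remainder )

asyncio.run( wait_politely( RobotsCache( ttl = 60.0 ), 'https://example.com' ) )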

204class ContentCache( Cache, instances_mutables = ( '_memory_total', ) ): 

205 ''' Cache manager for URL content (GET requests) with memory tracking. ''' 

206 

207 memory_max: int = 32 * 1024 * 1024 

208 

209 def __init__( 

210 self, *, 

211 robots_cache: __.Absential[ RobotsCache ] = __.absent, 

212 memory_max: __.Absential[ int ] = __.absent, 

213 **base_initargs: __.typx.Any 

214 ) -> None: 

215 super( ).__init__( **base_initargs ) 

216 if __.is_absent( robots_cache ): 

217 self.robots_cache = RobotsCache( **base_initargs ) 

218 else: self.robots_cache = robots_cache 

219 if not __.is_absent( memory_max ): self.memory_max = memory_max 

220 self._cache: dict[ str, ContentCacheEntry ] = { } 

221 self._memory_total = 0 

222 self._recency: __.collections.deque[ str ] = __.collections.deque( ) 

223 

224 @classmethod 

225 def from_configuration( 

226 cls, 

227 configuration: __.cabc.Mapping[ str, __.typx.Any ], 

228 robots_cache: __.Absential[ RobotsCache ] = __.absent 

229 ) -> __.typx.Self: 

230 ''' Creates ContentCache instance from application configuration. ''' 

231 cache_config = configuration.get( 'cache', { } ) 

232 content_ttl = cache_config.get( 'content-ttl', 300.0 ) 

233 memory_limit = cache_config.get( 'memory-limit', 33554432 ) 

234 nomargs = { 

235 'success_ttl': content_ttl, 

236 'memory_max': memory_limit, 

237 } 

238 if not __.is_absent( robots_cache ): 

239 nomargs[ 'robots_cache' ] = robots_cache 

240 return cls( **nomargs ) 

241 

242 async def access( 

243 self, url: str 

244 ) -> __.Absential[ tuple[ bytes, _httpx.Headers ] ]: 

245 ''' Retrieves cached content if valid. ''' 

246 if url not in self._cache: return __.absent 

247 entry = self._cache[ url ] 

248 if entry.invalid: 

249 self._remove( url ) 

250 return __.absent 

251 self._record_access( url ) 

252 return ( entry.response.extract( ), entry.headers ) 

253 

254 def determine_ttl( self, response: ContentResponse ) -> float: 

255 ''' Determines appropriate TTL based on response type. ''' 

256 if response.is_value( ): 

257 return self.success_ttl 

258 # TODO: Inspect exception type for more granular TTL 

259 return self.error_ttl 

260 

261 async def retrieve_url( 

262 self, 

263 url: _Url, /, *, 

264 duration_max: float = 30.0, 

265 client_factory: HttpClientFactory = _httpx.AsyncClient, 

266 ) -> bytes: 

267 ''' Convenience method for retrieving URL content. ''' 

268 return await retrieve_url( 

269 self, url, 

270 duration_max = duration_max, 

271 client_factory = client_factory ) 

272 

273 async def store( 

274 self, url: str, response: ContentResponse, 

275 headers: _httpx.Headers, ttl: float 

276 ) -> None: 

277 ''' Stores content in cache with memory management. ''' 

278 size_bytes = self._calculate_response_size( response ) 

279 entry = ContentCacheEntry( 

280 response = response, 

281 headers = headers, 

282 timestamp = __.time.time( ), 

283 ttl = ttl, 

284 size_bytes = size_bytes ) 

285 if old_entry := self._cache.get( url ): 

286 self._memory_total -= old_entry.memory_usage 

287 self._cache[ url ] = entry 

288 self._memory_total += entry.memory_usage 

289 self._record_access( url ) 

290 self._evict_by_memory( ) 

291 

292 def _calculate_response_size( self, response: ContentResponse ) -> int: 

293 ''' Calculates memory footprint of cached response. ''' 

294 if response.is_value( ): 

295 content = response.extract( ) 

296 return len( content ) 

297 return 100 # Conservative estimate for exception overhead 

298 

299 def _evict_by_memory( self ) -> None: 

300 ''' Evicts LRU entries until memory usage is under limit. ''' 

301 while ( 

302 self._memory_total > self.memory_max 

303 and self._recency 

304 ): 

305 lru_url = self._recency.popleft( ) 

306 if lru_url in self._cache: # pragma: no branch 

307 entry = self._cache[ lru_url ] 

308 self._memory_total -= entry.memory_usage 

309 del self._cache[ lru_url ] 

310 _scribe.debug( f"Evicted cache entry: {lru_url}" ) 

311 

312 def _record_access( self, url: str ) -> None: 

313 ''' Updates LRU access order for given URL. ''' 

314 with __.ctxl.suppress( ValueError ): 

315 self._recency.remove( url ) 

316 self._recency.append( url ) 

317 

318 def _remove( self, url: str ) -> None: 

319 ''' Removes entry from cache and updates memory tracking. ''' 

320 if entry := self._cache.pop( url, None ): 

321 self._memory_total -= entry.memory_usage 

322 with __.ctxl.suppress( ValueError ): 

323 self._recency.remove( url ) 

324 

325 
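A memory-accounting sketch with deliberately tiny, hypothetical numbers: each entry charges its payload length plus the 100-byte overhead estimate, so the second store below exceeds the budget and evicts the least-recently-used URL.

import asyncio
import appcore.generics as _generics
import httpx

async def demo( ) -> None:
    cache = ContentCache( memory_max = 1024 )
    payload = _generics.Value( b'x' * 600 )  # charged as 600 + 100 bytes
    await cache.store( 'https://example.com/a', payload, httpx.Headers( ), 300.0 )
    await cache.store( 'https://example.com/b', payload, httpx.Headers( ), 300.0 )
    # 1400 bytes would exceed memory_max, so _evict_by_memory drops 'a';
    # access now returns the absent sentinel for it.
    print( await cache.access( 'https://example.com/a' ) )

asyncio.run( demo( ) )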

326class ProbeCache( Cache ): 

327 ''' Cache manager for URL probe results (HEAD requests). ''' 

328 

329 entries_max: int = 1000 

330 

331 def __init__( 

332 self, *, 

333 robots_cache: __.Absential[ RobotsCache ] = __.absent, 

334 entries_max: __.Absential[ int ] = __.absent, 

335 **base_initargs: __.typx.Any 

336 ) -> None: 

337 super( ).__init__( **base_initargs ) 

338 if __.is_absent( robots_cache ): 

339 self.robots_cache = RobotsCache( **base_initargs ) 

340 else: self.robots_cache = robots_cache 

341 if not __.is_absent( entries_max ): self.entries_max = entries_max 

342 self._cache: dict[ str, ProbeCacheEntry ] = { } 

343 self._recency: __.collections.deque[ str ] = __.collections.deque( ) 

344 

345 @classmethod 

346 def from_configuration( 

347 cls, 

348 configuration: __.cabc.Mapping[ str, __.typx.Any ], 

349 robots_cache: __.Absential[ RobotsCache ] = __.absent 

350 ) -> __.typx.Self: 

351 ''' Creates ProbeCache instance from application configuration. ''' 

352 cache_config = configuration.get( 'cache', { } ) 

353 probe_ttl = cache_config.get( 'probe-ttl', 300.0 ) 

354 nomargs = { 'success_ttl': probe_ttl } 

355 if not __.is_absent( robots_cache ): 

356 nomargs[ 'robots_cache' ] = robots_cache 

357 return cls( **nomargs ) 

358 

359 async def access( self, url: str ) -> __.Absential[ bool ]: 

360 ''' Retrieves cached probe result if valid. ''' 

361 if url not in self._cache: return __.absent 

362 entry = self._cache[ url ] 

363 if entry.invalid: 

364 self._remove( url ) 

365 return __.absent 

366 self._record_access( url ) 

367 return entry.response.extract( ) 

368 

369 def determine_ttl( self, response: ProbeResponse ) -> float: 

370 ''' Determines appropriate TTL based on response type. ''' 

371 if response.is_value( ): 

372 return self.success_ttl 

373 # TODO: Inspect exception type for more granular TTL 

374 return self.error_ttl 

375 

376 async def probe_url( 

377 self, 

378 url: _Url, /, *, 

379 duration_max: float = 10.0, 

380 client_factory: HttpClientFactory = _httpx.AsyncClient, 

381 ) -> bool: 

382 ''' Convenience method for probing URL existence. ''' 

383 return await probe_url( 

384 self, url, 

385 duration_max = duration_max, 

386 client_factory = client_factory ) 

387 

388 async def store( 

389 self, url: str, response: ProbeResponse, ttl: float 

390 ) -> None: 

391 ''' Stores probe result in cache. ''' 

392 entry = ProbeCacheEntry( 

393 response = response, 

394 timestamp = __.time.time( ), 

395 ttl = ttl ) 

396 self._cache[ url ] = entry 

397 self._record_access( url ) 

398 self._evict_by_count( ) 

399 

400 def _evict_by_count( self ) -> None: 

401 ''' Evicts oldest entries when cache exceeds max size. ''' 

402 while ( 

403 len( self._cache ) > self.entries_max 

404 and self._recency 

405 ): 

406 lru_url = self._recency.popleft( ) 

407 if lru_url in self._cache: # pragma: no branch 

408 del self._cache[ lru_url ] 

409 

410 def _record_access( self, url: str ) -> None: 

411 ''' Updates LRU access order for given URL. ''' 

412 with __.ctxl.suppress( ValueError ): 

413 self._recency.remove( url ) 

414 self._recency.append( url ) 

415 

416 def _remove( self, url: str ) -> None: 

417 ''' Removes entry from cache. ''' 

418 self._cache.pop( url, None ) 

419 with __.ctxl.suppress( ValueError ): 

420 self._recency.remove( url ) 

421 

422 
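The TTL split above is easy to verify directly; a small check, assuming these module-level names are importable:

import appcore.generics as _generics

cache = ProbeCache( error_ttl = 15.0, success_ttl = 600.0 )
assert cache.determine_ttl( _generics.Value( True ) ) == 600.0
assert cache.determine_ttl( _generics.Error( TimeoutError( ) ) ) == 15.0
# Failed probes expire quickly, so transient outages get retried sooner.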

423_http_success_threshold = 400 

424 

425 

426_scribe = __.acquire_scribe( __name__ ) 

427 

428 

429def prepare( 

430 auxdata: __.Globals 

431) -> tuple[ ContentCache, ProbeCache, RobotsCache ]: 

432 ''' Prepares cache instances from configuration. 

433 

434 Returns content, probe, and robots caches; the content and probe caches share the robots cache. 

435 ''' 

436 configuration = auxdata.configuration 

437 robots_cache = RobotsCache.from_configuration( configuration ) 

438 return ( 

439 ContentCache.from_configuration( configuration, robots_cache ), 

440 ProbeCache.from_configuration( configuration, robots_cache ), 

441 robots_cache, 

442 ) 

443 

444 
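The from_configuration constructors above consume a small nested mapping; a hypothetical example of the expected shape (key names taken from the code, values illustrative):

configuration = {
    'cache': {
        'robots-ttl': 7200.0,      # RobotsCache ttl
        'content-ttl': 600.0,      # ContentCache success_ttl
        'memory-limit': 16777216,  # ContentCache memory_max, in bytes
        'probe-ttl': 120.0,        # ProbeCache success_ttl
    },
}
# prepare( ) reads this from auxdata.configuration and shares one RobotsCache
# across the content and probe caches, so crawl delays govern both.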

445async def probe_url( 

446 cache: ProbeCache, 

447 url: _Url, *, 

448 duration_max: float = 10.0, 

449 client_factory: HttpClientFactory = _httpx.AsyncClient, 

450) -> bool: 

451 ''' Cached HEAD request to check URL existence. ''' 

452 url_s = url.geturl( ) 

453 match url.scheme: 

454 case '' | 'file': 

455 return __.Path( url.path ).exists( ) 

456 case 'http' | 'https': 

457 result = await cache.access( url_s ) 

458 if not __.is_absent( result ): return result 

459 async with client_factory( ) as client: 

460 result = await _probe_url( 

461 url, duration_max = duration_max, 

462 client = client, 

463 probe_cache = cache, 

464 robots_cache = cache.robots_cache ) 

465 ttl = cache.determine_ttl( result ) 

466 await cache.store( url_s, result, ttl ) 

467 return result.extract( ) 

468 case _: return False 

469 

470 
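A usage sketch, assuming probe_url and ProbeCache are importable from this module: filesystem paths are answered without network traffic, while http(s) URLs go through the cache and a HEAD request.

import asyncio
from urllib.parse import urlparse

async def main( ) -> None:
    cache = ProbeCache( )
    exists = await probe_url( cache, urlparse( 'https://example.com/page' ) )
    print( exists )  # True for any response status below 400

asyncio.run( main( ) )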

471async def retrieve_url( 

472 cache: ContentCache, 

473 url: _Url, *, 

474 duration_max: float = 30.0, 

475 client_factory: HttpClientFactory = _httpx.AsyncClient, 

476) -> bytes: 

477 ''' Cached GET request to fetch URL content as bytes. ''' 

478 url_s = url.geturl( ) 

479 match url.scheme: 

480 case '' | 'file': 

481 location = __.Path( url.path ) 

482 try: return location.read_bytes( ) 

483 except Exception as exc: 

484 raise _exceptions.DocumentationInaccessibility( 

485 url_s, exc ) from exc 

486 case 'http' | 'https': 

487 result = await cache.access( url_s ) 

488 if not __.is_absent( result ): 

489 content_bytes, _ = result 

490 return content_bytes 

491 async with client_factory( ) as client: 

492 result, headers = await _retrieve_url( 

493 url, 

494 duration_max = duration_max, 

495 client = client, 

496 content_cache = cache, 

497 robots_cache = cache.robots_cache ) 

498 ttl = cache.determine_ttl( result ) 

499 await cache.store( url_s, result, headers, ttl ) 

500 return result.extract( ) 

501 case _: 

502 raise _exceptions.DocumentationInaccessibility( 

503 url_s, f"Unsupported scheme: {url.scheme}" ) 

504 

505 
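A sketch of the '' / 'file' branch, which bypasses the cache and reads straight from disk:

import asyncio, tempfile
from pathlib import Path
from urllib.parse import urlparse

async def main( ) -> None:
    with tempfile.NamedTemporaryFile( delete = False ) as stream:
        stream.write( b'hello' )
    url = urlparse( Path( stream.name ).as_uri( ) )
    print( await retrieve_url( ContentCache( ), url ) )  # b'hello'

asyncio.run( main( ) )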

506async def retrieve_url_as_text( 

507 cache: ContentCache, 

508 url: _Url, *, 

509 duration_max: float = 30.0, 

510 charset_default: str = 'utf-8', 

511 client_factory: HttpClientFactory = _httpx.AsyncClient, 

512) -> str: 

513 ''' Cached GET request to fetch URL content as text. ''' 

514 url_s = url.geturl( ) 

515 match url.scheme: 

516 case '' | 'file': 

517 location = __.Path( url.path ) 

518 try: content_bytes = location.read_bytes( ) 

519 except Exception as exc: 

520 raise _exceptions.DocumentationInaccessibility( 

521 url_s, exc ) from exc 

522 mimetype, charset = __.detext.detect_mimetype_and_charset( 

523 content_bytes, location ) 

524 if not __.detext.is_textual_content( content_bytes ): 

525 raise _exceptions.DocumentationInaccessibility( 

526 url_s, "Content analysis indicates non-textual data" ) 

527 encoding = charset or charset_default 

528 return content_bytes.decode( encoding ) 

529 case 'http' | 'https': 

530 result = await cache.access( url_s ) 

531 if not __.is_absent( result ): 

532 content_bytes, headers = result 

533 _validate_textual_content( 

534 content_bytes, headers, url_s ) 

535 charset = _detect_charset_with_fallback( 

536 content_bytes, headers, charset_default ) 

537 return content_bytes.decode( charset ) 

538 async with client_factory( ) as client: 

539 result, headers = await _retrieve_url( 

540 url, duration_max = duration_max, 

541 client = client, 

542 content_cache = cache, 

543 robots_cache = cache.robots_cache ) 

544 ttl = cache.determine_ttl( result ) 

545 await cache.store( url_s, result, headers, ttl ) 

546 content_bytes = result.extract( ) 

547 _validate_textual_content( 

548 content_bytes, headers, url_s ) 

549 charset = _detect_charset_with_fallback( 

550 content_bytes, headers, charset_default ) 

551 return content_bytes.decode( charset ) 

552 case _: 

553 raise _exceptions.DocumentationInaccessibility( 

554 url_s, f"Unsupported scheme: {url.scheme}" ) 

555 

556 
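For http(s) content the decode order is: charset from the Content-Type header, then content sniffing, then charset_default. A hedged sketch of overriding that final fallback, assuming the names are importable:

import asyncio
from urllib.parse import urlparse

async def main( ) -> None:
    cache = ContentCache( )
    text = await retrieve_url_as_text(
        cache, urlparse( 'https://example.com/readme' ),
        charset_default = 'latin-1' )  # used only if headers and sniffing both fail
    print( text[ : 80 ] )

asyncio.run( main( ) )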

557async def _apply_request_delay( 

558 url: _Url, 

559 client: _httpx.AsyncClient, 

560 cache: RobotsCache, 

561) -> None: 

562 ''' Applies crawl delay to request if specified in robots.txt. ''' 

563 if url.scheme not in ( 'http', 'https' ): return 

564 domain = _extract_domain( url ) 

565 delay = cache.calculate_delay_remainder( domain ) 

566 if delay > 0: await cache.delay_function( delay ) 

567 parser = await cache.access( domain ) 

568 if __.is_absent( parser ): 

569 parser = await _retrieve_robots_txt( client, cache, domain ) 

570 if not __.is_absent( parser ): 

571 try: delay = parser.crawl_delay( cache.user_agent ) 

572 except Exception as exc: 

573 _scribe.debug( f"Failed to get crawl delay for {domain}: {exc}" ) 

574 else: 

575 if delay: cache.assign_delay( domain, float( delay ) ) 

576 

577 

578async def _cache_robots_txt_error( 

579 domain: str, cache: RobotsCache, error: Exception 

580) -> __.Absential[ _RobotFileParser ]: 

581 _scribe.debug( f"Failed to fetch/parse robots.txt from {domain}: {error}" ) 

582 result: RobotsResponse = _generics.Error( error ) 

583 return await _cache_robots_txt_result( cache, domain, result ) 

584 

585 

586async def _cache_robots_txt_result( 

587 cache: RobotsCache, domain: str, result: RobotsResponse 

588) -> __.Absential[ _RobotFileParser ]: 

589 ttl = cache.determine_ttl( result ) 

590 await cache.store( domain, result, ttl ) 

591 return result.extract( ) if result.is_value( ) else __.absent 

592 

593 

594async def _check_robots_txt( 

595 url: _Url, *, 

596 client: _httpx.AsyncClient, 

597 cache: RobotsCache, 

598) -> bool: 

599 ''' Checks if URL is allowed by robots.txt. ''' 

600 if url.scheme not in ( 'http', 'https' ): return True 

601 url_s = url.geturl( ) 

602 domain = _extract_domain( url ) 

603 parser = await cache.access( domain ) 

604 if __.is_absent( parser ): 

605 parser = await _retrieve_robots_txt( client, cache, domain ) 

606 if __.is_absent( parser ): return True 

607 try: return parser.can_fetch( cache.user_agent, url_s ) 

608 except Exception as exc: 

609 _scribe.debug( f"robots.txt check failed for {url_s}: {exc}" ) 

610 return True # on parser error, assume URL allowed 

611 

612 
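For reference, the stdlib parser behavior this check leans on, runnable with urllib.robotparser alone:

from urllib.robotparser import RobotFileParser

parser = RobotFileParser( )
parser.parse( [
    'User-agent: *',
    'Disallow: /private/',
    'Crawl-delay: 2',
] )
assert parser.can_fetch( '*', 'https://example.com/docs/page.html' )
assert not parser.can_fetch( '*', 'https://example.com/private/key' )
assert parser.crawl_delay( '*' ) == 2  # consumed by _apply_request_delay above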

613def _detect_charset_with_fallback( 

614 content: bytes, headers: _httpx.Headers, default: str 

615) -> str: 

616 ''' Detects charset from headers with content-based fallback. ''' 

617 header_charset = _extract_charset_from_headers( headers, '' ) 

618 if header_charset: 

619 return header_charset 

620 detected_charset = __.detext.detect_charset( content ) 

621 return detected_charset or default 

622 

623 

624def _detect_mimetype_with_fallback( 

625 content: bytes, headers: _httpx.Headers, url: str 

626) -> str: 

627 ''' Detects MIME type from headers with content-based fallback. ''' 

628 header_mimetype = _extract_mimetype_from_headers( headers ) 

629 if header_mimetype: 

630 return header_mimetype 

631 return __.detext.detect_mimetype( content, url ) or '' 

632 

633 

634def _extract_charset_from_headers( 

635 headers: _httpx.Headers, default: str 

636) -> str: 

637 ''' Extracts charset from Content-Type header. ''' 

638 content_type = headers.get( 'content-type', '' ) 

639 if isinstance( content_type, str ) and ';' in content_type: 

640 _, _, params = content_type.partition( ';' ) 

641 if 'charset=' in params: 

642 charset = params.split( 'charset=' )[ -1 ].strip( ) 

643 return charset.strip( '"\'' ) 

644 return default 

645 

646 

647def _extract_domain( url: _Url ) -> str: 

648 ''' Extracts domain from URL for robots.txt caching. ''' 

649 return f"{url.scheme}://{url.netloc}" 

650 

651 

652def _extract_mimetype_from_headers( headers: _httpx.Headers ) -> str: 

653 ''' Extracts mimetype from Content-Type header. ''' 

654 content_type = headers.get( 'content-type', '' ) 

655 if isinstance( content_type, str ) and ';' in content_type: 

656 mimetype, _, _ = content_type.partition( ';' ) 

657 return mimetype.strip( ) 

658 return content_type 

659 

660 
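A quick check of both header helpers, assuming the private names are reachable (e.g. from the module's own tests):

import httpx

headers = httpx.Headers( { 'content-type': 'text/html; charset="iso-8859-1"' } )
assert _extract_mimetype_from_headers( headers ) == 'text/html'
assert _extract_charset_from_headers( headers, 'utf-8' ) == 'iso-8859-1'
# Without a parameter section, the charset helper returns the supplied default.
assert _extract_charset_from_headers( httpx.Headers( ), 'utf-8' ) == 'utf-8'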

661async def _probe_url( 

662 url: _Url, /, *, 

663 duration_max: float, 

664 client: _httpx.AsyncClient, 

665 probe_cache: ProbeCache, 

666 robots_cache: RobotsCache, 

667) -> ProbeResponse: 

668 ''' Makes HEAD request with deduplication. ''' 

669 url_s = url.geturl( ) 

670 if not await _check_robots_txt( 

671 url, client = client, cache = robots_cache 

672 ): 

673 _scribe.debug( f"URL blocked by robots.txt: {url_s}" ) 

674 return _generics.Error( _exceptions.UrlImpermissibility( 

675 url_s, robots_cache.user_agent ) ) 

676 await _apply_request_delay( url, cache = robots_cache, client = client ) 

677 async with probe_cache.acquire_mutex_for( url_s ): 

678 try: 

679 response = await client.head( 

680 url_s, timeout = duration_max, follow_redirects = True ) 

681 except Exception as exc: 

682 _scribe.debug( f"HEAD request failed for {url_s}: {exc}" ) 

683 return _generics.Error( exc ) 

684 else: 

685 return _generics.Value( 

686 response.status_code < _http_success_threshold ) 

687 

688 

689async def _retrieve_robots_txt( 

690 client: _httpx.AsyncClient, cache: RobotsCache, domain: str 

691) -> __.Absential[ _RobotFileParser ]: 

692 ''' Fetches and parses robots.txt for domain. ''' 

693 robots_url = f"{domain}/robots.txt" 

694 async with cache.acquire_mutex_for( domain ): 

695 timeout = cache.request_timeout 

696 try: 

697 response = await client.get( 

698 robots_url, timeout = timeout, follow_redirects = True ) 

699 except Exception as exc: 

700 return await _cache_robots_txt_error( domain, cache, exc ) 

701 match response.status_code: 

702 case _HttpStatus.OK: lines = response.text.splitlines( ) 

703 case _HttpStatus.NOT_FOUND: lines = [ ] 

704 case _: 

705 try: response.raise_for_status( ) 

706 except Exception as exc: 

707 return await _cache_robots_txt_error( domain, cache, exc ) 

708 robots_parser = _RobotFileParser( ) 

709 robots_parser.set_url( robots_url ) 

710 try: robots_parser.parse( lines ) 

711 except Exception as exc: 

712 return await _cache_robots_txt_error( domain, cache, exc ) 

713 result: RobotsResponse = _generics.Value( robots_parser ) 

714 return await _cache_robots_txt_result( cache, domain, result ) 

715 

716 

717async def _retrieve_url( 

718 url: _Url, /, *, 

719 duration_max: float, 

720 client: _httpx.AsyncClient, 

721 content_cache: ContentCache, 

722 robots_cache: RobotsCache, 

723) -> tuple[ ContentResponse, _httpx.Headers ]: 

724 ''' Makes GET request with deduplication. ''' 

725 url_s = url.geturl( ) 

726 if not await _check_robots_txt( 

727 url, cache = robots_cache, client = client 

728 ): 

729 return ( 

730 _generics.Error( _exceptions.UrlImpermissibility( 

731 url_s, robots_cache.user_agent ) ), 

732 _httpx.Headers( ) ) 

733 await _apply_request_delay( url, cache = robots_cache, client = client ) 

734 async with content_cache.acquire_mutex_for( url_s ): 

735 try: 

736 response = await client.get( 

737 url_s, timeout = duration_max, follow_redirects = True ) 

738 response.raise_for_status( ) 

739 except Exception as exc: 

740 _scribe.debug( f"GET request failed for {url_s}: {exc}" ) 

741 return _generics.Error( exc ), _httpx.Headers( ) 

742 else: return _generics.Value( response.content ), response.headers 

743 

744 

745def _validate_textual_content( 

746 content: bytes, headers: _httpx.Headers, url: str 

747) -> None: 

748 ''' Validates that content is textual via headers and content analysis. ''' 

749 mimetype = _detect_mimetype_with_fallback( content, headers, url ) 

750 if mimetype and not __.detext.is_textual_mimetype( mimetype ): 

751 raise _exceptions.HttpContentTypeInvalidity( 

752 url, mimetype, "text decoding" ) 

753 if not __.detext.is_textual_content( content ): 

754 raise _exceptions.HttpContentTypeInvalidity( 

755 url, mimetype or 'unknown', "content analysis" )