Coverage for sources/librovore/cacheproxy.py: 87%
404 statements
coverage.py v7.11.0, created at 2025-10-20 18:40 +0000
1# vim: set filetype=python fileencoding=utf-8:
2# -*- coding: utf-8 -*-
4#============================================================================#
5# #
6# Licensed under the Apache License, Version 2.0 (the "License"); #
7# you may not use this file except in compliance with the License. #
8# You may obtain a copy of the License at #
9# #
10# http://www.apache.org/licenses/LICENSE-2.0 #
11# #
12# Unless required by applicable law or agreed to in writing, software #
13# distributed under the License is distributed on an "AS IS" BASIS, #
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
15# See the License for the specific language governing permissions and #
16# limitations under the License. #
17# #
18#============================================================================#
21''' HTTP cache for documentation URL access. '''
24from http import HTTPStatus as _HttpStatus
25from urllib.parse import ParseResult as _Url
26from urllib.robotparser import RobotFileParser as _RobotFileParser
28import appcore.generics as _generics
29import httpx as _httpx
31from . import __
32from . import exceptions as _exceptions
35HttpClientFactory: __.typx.TypeAlias = (
36 __.cabc.Callable[ [ ], _httpx.AsyncClient ] )
37ContentResponse: __.typx.TypeAlias = _generics.Result[ bytes, Exception ]
38ProbeResponse: __.typx.TypeAlias = _generics.Result[ bool, Exception ]
39RobotsResponse: __.typx.TypeAlias = (
40 _generics.Result[ _RobotFileParser, Exception ] )
43class CacheEntry( __.immut.DataclassObject ):
44 ''' Cache entry base. '''
46 timestamp: float
47 ttl: float
49 @property
50 def invalid( self ) -> bool:
51 ''' Checks if cache entry has exceeded its TTL. '''
52 return __.time.time( ) - self.timestamp > self.ttl
55class ContentCacheEntry( CacheEntry ):
56 ''' Cache entry for URL content with size tracking. '''
58 response: ContentResponse
59 headers: _httpx.Headers
60 size_bytes: int
62 @property
63 def memory_usage( self ) -> int:
64 ''' Calculates total memory usage including metadata. '''
65 return self.size_bytes + 100 # Overhead estimate
68class ProbeCacheEntry( CacheEntry ):
69 ''' Cache entry for URL probe results. '''
71 response: ProbeResponse
74class RobotsCacheEntry( CacheEntry ):
75 ''' Cache entry for robots.txt parser. '''
77 response: RobotsResponse
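# Illustrative sketch (not part of the module): an entry becomes invalid once
# its age exceeds its TTL; the ten-second-old timestamp and keyword
# construction below mirror how the module itself builds entries.
def _example_entry_expiry( ) -> None:
    entry = ProbeCacheEntry(
        response = _generics.Value( True ),
        timestamp = __.time.time( ) - 10.0, ttl = 5.0 )
    assert entry.invalid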
80class Cache( __.immut.Object ):
81 ''' Cache base with shared configuration attributes. '''
83 error_ttl: float = 30.0
84 success_ttl: float = 300.0
86 def __init__(
87 self, *,
88 error_ttl: __.Absential[ float ] = __.absent,
89 success_ttl: __.Absential[ float ] = __.absent,
90 delay_function: __.cabc.Callable[
91 [ float ], __.cabc.Awaitable[ None ]
92 ] = __.asyncio.sleep
93 ) -> None:
94 if not __.is_absent( error_ttl ): self.error_ttl = error_ttl
95 if not __.is_absent( success_ttl ): self.success_ttl = success_ttl
96 self.delay_function = delay_function
97 self._request_mutexes: dict[ str, __.asyncio.Lock ] = { }
99 @__.ctxl.asynccontextmanager
100 async def acquire_mutex_for( self, url: str ):
101 ''' Acquires mutex for HTTP request deduplication. '''
102 if url not in self._request_mutexes: # pragma: no branch
103 self._request_mutexes[ url ] = __.asyncio.Lock( )
104 mutex = self._request_mutexes[ url ]
105 async with mutex:
106 try: yield
107 finally: self._request_mutexes.pop( url, None )
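# Illustrative sketch (not part of the module): concurrent callers for the same
# URL serialize on the per-URL mutex, so only one request body runs at a time;
# the helper name and the idea of two racing callers are hypothetical.
async def _example_request_deduplication( cache: Cache, url: str ) -> None:
    async def fetch_once( ) -> None:
        async with cache.acquire_mutex_for( url ):
            ...  # at most one coroutine executes here per URL at a time
    await __.asyncio.gather( fetch_once( ), fetch_once( ) )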
110class RobotsCache( Cache ):
111 ''' Cache manager for robots.txt files with crawl delay tracking. '''
113 entries_max: int = 500
114 request_timeout: float = 5.0
115 ttl: float = 3600.0
116 user_agent: str = '*'
117 def __init__(
118 self, *,
119 entries_max: __.Absential[ int ] = __.absent,
120 ttl: __.Absential[ float ] = __.absent,
121 request_timeout: __.Absential[ float ] = __.absent,
122 user_agent: __.Absential[ str ] = __.absent,
123 **base_initargs: __.typx.Any
124 ) -> None:
125 super( ).__init__( **base_initargs )
126 if not __.is_absent( entries_max ): self.entries_max = entries_max
127 if not __.is_absent( ttl ): self.ttl = ttl
128 if not __.is_absent( request_timeout ):
129 self.request_timeout = request_timeout
130 if not __.is_absent( user_agent ): self.user_agent = user_agent
131 self._cache: dict[ str, RobotsCacheEntry ] = { }
132 self._recency: __.collections.deque[ str ] = __.collections.deque( )
133 self._request_delays: dict[ str, float ] = { }
135 @classmethod
136 def from_configuration(
137 cls, configuration: __.cabc.Mapping[ str, __.typx.Any ]
138 ) -> __.typx.Self:
139 ''' Creates RobotsCache instance from application configuration. '''
140 cache_config = configuration.get( 'cache', { } )
141 robots_ttl = cache_config.get( 'robots-ttl', 3600.0 )
142 return cls( ttl = robots_ttl )
144 async def access(
145 self, client: _httpx.AsyncClient, domain: str, # TODO: retriever
146 ) -> _RobotFileParser:
147 ''' Retrieves cached robots.txt parser if valid. '''
148 if domain not in self._cache:
149 await _retrieve_robots_txt( client, self, domain )
150 entry = self._cache[ domain ]
151 if entry.invalid:  # coverage: 151 ↛ 152 (condition never true)
152 self._remove( domain )
153 await _retrieve_robots_txt( client, self, domain )
154 entry = self._cache[ domain ]
155 self._record_access( domain )
156 return entry.response.extract( )
158 def assign_delay( self, domain: str, delay_seconds: float ) -> None:
159 ''' Sets next allowed request time for domain. '''
160 self._request_delays[ domain ] = __.time.time( ) + delay_seconds
162 def calculate_delay_remainder( self, domain: str ) -> float:
163 ''' Returns remaining crawl delay time for domain. '''
164 allow_at = self._request_delays.get( domain, 0.0 )
165 if not allow_at: return 0.0
166 remainder = allow_at - __.time.time( )
167 return max( 0.0, remainder )
169 def determine_ttl( self, response: RobotsResponse ) -> float:
170 ''' Determines appropriate TTL based on response type. '''
171 if response.is_value( ): return self.ttl
172 return self.error_ttl
174 async def store(
175 self, domain: str, response: RobotsResponse, ttl: float
176 ) -> None:
177 ''' Stores robots.txt parser in cache. '''
178 entry = RobotsCacheEntry(
179 response = response, timestamp = __.time.time( ), ttl = ttl )
180 self._cache[ domain ] = entry
181 self._record_access( domain )
182 self._evict_by_count( )
184 def _evict_by_count( self ) -> None:
185 ''' Evicts oldest entries when cache exceeds max size. '''
186 while (
187 len( self._cache ) > self.entries_max
188 and self._recency
189 ):
190 lru_domain = self._recency.popleft( )
191 if lru_domain in self._cache: # pragma: no branch
192 del self._cache[ lru_domain ]
194 def _record_access( self, domain: str ) -> None:
195 ''' Updates LRU access order for given domain. '''
196 with __.ctxl.suppress( ValueError ):
197 self._recency.remove( domain )
198 self._recency.append( domain )
200 def _remove( self, domain: str ) -> None:
201 ''' Removes entry from cache. '''
202 self._cache.pop( domain, None )
203 with __.ctxl.suppress( ValueError ):
204 self._recency.remove( domain )
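# Illustrative sketch (not part of the module): crawl-delay bookkeeping is
# plain wall-clock arithmetic; the domain string is an example only.
def _example_crawl_delay_tracking( ) -> None:
    robots = RobotsCache( ttl = 3600.0 )
    robots.assign_delay( 'https://docs.example.org', 2.0 )
    remaining = robots.calculate_delay_remainder( 'https://docs.example.org' )
    assert 0.0 <= remaining <= 2.0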
207class ContentCache( Cache, instances_mutables = ( '_memory_total', ) ):
208 ''' Cache manager for URL content (GET requests) with memory tracking. '''
210 memory_max: int = 32 * 1024 * 1024
212 def __init__(
213 self, *,
214 robots_cache: __.Absential[ RobotsCache ] = __.absent,
215 memory_max: __.Absential[ int ] = __.absent,
216 **base_initargs: __.typx.Any
217 ) -> None:
218 super( ).__init__( **base_initargs )
219 if __.is_absent( robots_cache ):
220 self.robots_cache = RobotsCache( **base_initargs )
221 else: self.robots_cache = robots_cache
222 if not __.is_absent( memory_max ): self.memory_max = memory_max
223 self._cache: dict[ str, ContentCacheEntry ] = { }
224 self._memory_total = 0
225 self._recency: __.collections.deque[ str ] = __.collections.deque( )
227 @classmethod
228 def from_configuration(
229 cls,
230 configuration: __.cabc.Mapping[ str, __.typx.Any ],
231 robots_cache: __.Absential[ RobotsCache ] = __.absent
232 ) -> __.typx.Self:
233 ''' Creates ContentCache instance from application configuration. '''
234 cache_config = configuration.get( 'cache', { } )
235 content_ttl = cache_config.get( 'content-ttl', 300.0 )
236 memory_limit = cache_config.get( 'memory-limit', 33554432 )
237 nomargs = {
238 'success_ttl': content_ttl,
239 'memory_max': memory_limit,
240 }
241 if not __.is_absent( robots_cache ):
242 nomargs[ 'robots_cache' ] = robots_cache
243 return cls( **nomargs )
245 async def access(
246 self, url: str
247 ) -> __.Absential[ tuple[ bytes, _httpx.Headers ] ]:
248 ''' Retrieves cached content if valid. '''
249 if url not in self._cache: return __.absent
250 entry = self._cache[ url ]
251 if entry.invalid:
252 self._remove( url )
253 return __.absent
254 self._record_access( url )
255 return ( entry.response.extract( ), entry.headers )
257 def determine_ttl( self, response: ContentResponse ) -> float:
258 ''' Determines appropriate TTL based on response type. '''
259 if response.is_value( ):
260 return self.success_ttl
261 # TODO: Inspect exception type for more granular TTL
262 return self.error_ttl
264 async def retrieve_url(
265 self,
266 url: _Url, /, *,
267 duration_max: float = 30.0,
268 client_factory: HttpClientFactory = _httpx.AsyncClient,
269 ) -> bytes:
270 ''' Convenience method for retrieving URL content. '''
271 return await retrieve_url(
272 self, url,
273 duration_max = duration_max,
274 client_factory = client_factory )
276 async def store(
277 self, url: str, response: ContentResponse,
278 headers: _httpx.Headers, ttl: float
279 ) -> None:
280 ''' Stores content in cache with memory management. '''
281 size_bytes = self._calculate_response_size( response )
282 entry = ContentCacheEntry(
283 response = response,
284 headers = headers,
285 timestamp = __.time.time( ),
286 ttl = ttl,
287 size_bytes = size_bytes )
288 if old_entry := self._cache.get( url ):
289 self._memory_total -= old_entry.memory_usage
290 self._cache[ url ] = entry
291 self._memory_total += entry.memory_usage
292 self._record_access( url )
293 self._evict_by_memory( )
295 def _calculate_response_size( self, response: ContentResponse ) -> int:
296 ''' Calculates memory footprint of cached response. '''
297 if response.is_value( ):
298 content = response.extract( )
299 return len( content )
300 return 100 # Conservative estimate for exception overhead
302 def _evict_by_memory( self ) -> None:
303 ''' Evicts LRU entries until memory usage is under limit. '''
304 while (
305 self._memory_total > self.memory_max
306 and self._recency
307 ):
308 lru_url = self._recency.popleft( )
309 if lru_url in self._cache: # pragma: no branch
310 entry = self._cache[ lru_url ]
311 self._memory_total -= entry.memory_usage
312 del self._cache[ lru_url ]
313 _scribe.debug( f"Evicted cache entry: {lru_url}" )
315 def _record_access( self, url: str ) -> None:
316 ''' Updates LRU access order for given URL. '''
317 with __.ctxl.suppress( ValueError ):
318 self._recency.remove( url )
319 self._recency.append( url )
321 def _remove( self, url: str ) -> None:
322 ''' Removes entry from cache and updates memory tracking. '''
323 if entry := self._cache.pop( url, None ):
324 self._memory_total -= entry.memory_usage
325 with __.ctxl.suppress( ValueError ):
326 self._recency.remove( url )
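# Illustrative sketch (not part of the module): successful responses are cached
# with success_ttl, failures with the shorter error_ttl, using the Result types
# (_generics.Value / _generics.Error) already employed above.
def _example_ttl_selection( cache: ContentCache ) -> None:
    assert cache.determine_ttl( _generics.Value( b'payload' ) ) == cache.success_ttl
    assert cache.determine_ttl( _generics.Error( TimeoutError( ) ) ) == cache.error_ttl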
329class ProbeCache( Cache ):
330 ''' Cache manager for URL probe results (HEAD requests). '''
332 entries_max: int = 1000
334 def __init__(
335 self, *,
336 robots_cache: __.Absential[ RobotsCache ] = __.absent,
337 entries_max: __.Absential[ int ] = __.absent,
338 **base_initargs: __.typx.Any
339 ) -> None:
340 super( ).__init__( **base_initargs )
341 if __.is_absent( robots_cache ):
342 self.robots_cache = RobotsCache( **base_initargs )
343 else: self.robots_cache = robots_cache
344 if not __.is_absent( entries_max ): self.entries_max = entries_max
345 self._cache: dict[ str, ProbeCacheEntry ] = { }
346 self._recency: __.collections.deque[ str ] = __.collections.deque( )
348 @classmethod
349 def from_configuration(
350 cls,
351 configuration: __.cabc.Mapping[ str, __.typx.Any ],
352 robots_cache: __.Absential[ RobotsCache ] = __.absent
353 ) -> __.typx.Self:
354 ''' Creates ProbeCache instance from application configuration. '''
355 cache_config = configuration.get( 'cache', { } )
356 probe_ttl = cache_config.get( 'probe-ttl', 300.0 )
357 nomargs = { 'success_ttl': probe_ttl }
358 if not __.is_absent( robots_cache ):
359 nomargs[ 'robots_cache' ] = robots_cache
360 return cls( **nomargs )
362 async def access( self, url: str ) -> __.Absential[ bool ]:
363 ''' Retrieves cached probe result if valid. '''
364 if url not in self._cache: return __.absent
365 entry = self._cache[ url ]
366 if entry.invalid:
367 self._remove( url )
368 return __.absent
369 self._record_access( url )
370 return entry.response.extract( )
372 def determine_ttl( self, response: ProbeResponse ) -> float:
373 ''' Determines appropriate TTL based on response type. '''
374 if response.is_value( ):
375 return self.success_ttl
376 # TODO: Inspect exception type for more granular TTL
377 return self.error_ttl
379 async def probe_url(
380 self,
381 url: _Url, /, *,
382 duration_max: float = 10.0,
383 client_factory: HttpClientFactory = _httpx.AsyncClient,
384 ) -> bool:
385 ''' Convenience method for probing URL existence. '''
386 return await probe_url(
387 self, url,
388 duration_max = duration_max,
389 client_factory = client_factory )
391 async def store(
392 self, url: str, response: ProbeResponse, ttl: float
393 ) -> None:
394 ''' Stores probe result in cache. '''
395 entry = ProbeCacheEntry(
396 response = response,
397 timestamp = __.time.time( ),
398 ttl = ttl )
399 self._cache[ url ] = entry
400 self._record_access( url )
401 self._evict_by_count( )
403 def _evict_by_count( self ) -> None:
404 ''' Evicts oldest entries when cache exceeds max size. '''
405 while (
406 len( self._cache ) > self.entries_max
407 and self._recency
408 ):
409 lru_url = self._recency.popleft( )
410 if lru_url in self._cache: # pragma: no branch
411 del self._cache[ lru_url ]
413 def _record_access( self, url: str ) -> None:
414 ''' Updates LRU access order for given URL. '''
415 with __.ctxl.suppress( ValueError ):
416 self._recency.remove( url )
417 self._recency.append( url )
419 def _remove( self, url: str ) -> None:
420 ''' Removes entry from cache. '''
421 self._cache.pop( url, None )
422 with __.ctxl.suppress( ValueError ):
423 self._recency.remove( url )
426_http_success_threshold = 400
429_scribe = __.acquire_scribe( __name__ )
432def prepare(
433 auxdata: __.Globals
434) -> tuple[ ContentCache, ProbeCache, RobotsCache ]:
435 ''' Prepares cache instances from configuration.
437 Returns cache instances constructed from application configuration.
438 '''
439 configuration = auxdata.configuration
440 robots_cache = RobotsCache.from_configuration( configuration )
441 return (
442 ContentCache.from_configuration( configuration, robots_cache ),
443 ProbeCache.from_configuration( configuration, robots_cache ),
444 robots_cache,
445 )
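# Illustrative sketch (not part of the module): the same cache trio can be built
# from a plain mapping; the keys below are the ones prepare() and the
# from_configuration() constructors read, and the values are examples only.
def _example_caches_from_mapping( ) -> tuple[ ContentCache, ProbeCache, RobotsCache ]:
    configuration = {
        'cache': {
            'content-ttl': 300.0,      # ContentCache success TTL, seconds
            'probe-ttl': 300.0,        # ProbeCache success TTL, seconds
            'robots-ttl': 3600.0,      # RobotsCache TTL, seconds
            'memory-limit': 33554432,  # ContentCache memory budget, bytes
        },
    }
    robots = RobotsCache.from_configuration( configuration )
    return (
        ContentCache.from_configuration( configuration, robots ),
        ProbeCache.from_configuration( configuration, robots ),
        robots )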
448async def probe_url(
449 cache: ProbeCache,
450 url: _Url, *,
451 duration_max: float = 10.0,
452 client_factory: HttpClientFactory = _httpx.AsyncClient,
453) -> bool:
454 ''' Cached HEAD request to check URL existence. '''
455 url_s = url.geturl( )
456 match url.scheme:
457 case '' | 'file':
458 return __.Path( url.path ).exists( )
459 case 'http' | 'https':
460 result = await cache.access( url_s )
461 if not __.is_absent( result ): return result
462 async with client_factory( ) as client:
463 result = await _probe_url(
464 url, duration_max = duration_max,
465 client = client,
466 probe_cache = cache,
467 robots_cache = cache.robots_cache )
468 ttl = cache.determine_ttl( result )
469 await cache.store( url_s, result, ttl )
470 return result.extract( )
471 case _: return False
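# Illustrative sketch (not part of the module): probing a filesystem path and an
# HTTP URL through the same entry point; the paths and URLs are examples only.
async def _example_probe( cache: ProbeCache ) -> None:
    from urllib.parse import urlparse
    local_exists = await probe_url( cache, urlparse( '/srv/docs/objects.inv' ) )
    remote_exists = await probe_url(
        cache, urlparse( 'https://docs.example.org/objects.inv' ),
        duration_max = 5.0 )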
474async def retrieve_url(
475 cache: ContentCache,
476 url: _Url, *,
477 duration_max: float = 30.0,
478 client_factory: HttpClientFactory = _httpx.AsyncClient,
479) -> bytes:
480 ''' Cached GET request to fetch URL content as bytes. '''
481 url_s = url.geturl( )
482 match url.scheme:
483 case '' | 'file':
484 location = __.Path( url.path )
485 try: return location.read_bytes( )
486 except Exception as exc:
487 raise _exceptions.DocumentationInaccessibility(
488 url_s, exc ) from exc
489 case 'http' | 'https':
490 result = await cache.access( url_s )
491 if not __.is_absent( result ):
492 content_bytes, _ = result
493 return content_bytes
494 async with client_factory( ) as client:
495 result, headers = await _retrieve_url(
496 url,
497 duration_max = duration_max,
498 client = client,
499 content_cache = cache,
500 robots_cache = cache.robots_cache )
501 ttl = cache.determine_ttl( result )
502 await cache.store( url_s, result, headers, ttl )
503 return result.extract( )
504 case _:
505 raise _exceptions.DocumentationInaccessibility(
506 url_s, f"Unsupported scheme: {url.scheme}" )
509async def retrieve_url_as_text(
510 cache: ContentCache,
511 url: _Url, *,
512 duration_max: float = 30.0,
513 charset_default: str = 'utf-8',
514 client_factory: HttpClientFactory = _httpx.AsyncClient,
515) -> str:
516 ''' Cached GET request to fetch URL content as text. '''
517 url_s = url.geturl( )
518 match url.scheme:
519 case '' | 'file':
520 location = __.Path( url.path )
521 try: content_bytes = location.read_bytes( )
522 except Exception as exc:
523 raise _exceptions.DocumentationInaccessibility(
524 url_s, exc ) from exc
525 _, charset = __.detext.detect_mimetype_and_charset(
526 content_bytes, location )
527 if not __.detext.is_textual_content( content_bytes ):  # coverage: 527 ↛ 528 (condition never true)
528 raise _exceptions.DocumentationInaccessibility(
529 url_s, "Content analysis indicates non-textual data" )
530 encoding = charset or charset_default
531 return content_bytes.decode( encoding )
532 case 'http' | 'https':
533 result = await cache.access( url_s )
534 if not __.is_absent( result ):
535 content_bytes, headers = result
536 _validate_textual_content(
537 content_bytes, headers, url_s )
538 charset = _detect_charset_with_fallback(
539 content_bytes, headers, charset_default )
540 return content_bytes.decode( charset )
541 async with client_factory( ) as client:
542 result, headers = await _retrieve_url(
543 url, duration_max = duration_max,
544 client = client,
545 content_cache = cache,
546 robots_cache = cache.robots_cache )
547 ttl = cache.determine_ttl( result )
548 await cache.store( url_s, result, headers, ttl )
549 content_bytes = result.extract( )
550 _validate_textual_content(
551 content_bytes, headers, url_s )
552 charset = _detect_charset_with_fallback(
553 content_bytes, headers, charset_default )
554 return content_bytes.decode( charset )
555 case _:
556 raise _exceptions.DocumentationInaccessibility(
557 url_s, f"Unsupported scheme: {url.scheme}" )
560async def _apply_request_delay(
561 url: _Url,
562 client: _httpx.AsyncClient,
563 cache: RobotsCache,
564) -> None:
565 ''' Applies crawl delay to request if specified in robots.txt. '''
566 if url.scheme not in ( 'http', 'https' ): return  # coverage: 566 ↛ exit (return not executed)
567 domain = _extract_domain( url )
568 delay = cache.calculate_delay_remainder( domain )
569 if delay > 0: await cache.delay_function( delay )
570 try: parser = await cache.access( client, domain )
571 except _exceptions.RobotsTxtAccessFailure as exc:
572 _scribe.debug(
573 f"robots.txt access failed for {domain}: {exc.cause}. "
574 f"Skipping crawl delay application." )
575 return # Skip crawl delay when robots.txt unavailable
576 try: delay = parser.crawl_delay( cache.user_agent )
577 except Exception as exc:
578 _scribe.debug( f"Failed to get crawl delay for {domain}: {exc}" )
579 else:
580 if delay: cache.assign_delay( domain, float( delay ) )
583async def _cache_robots_txt_error(
584 domain: str, cache: RobotsCache, error: Exception
585) -> __.Absential[ _RobotFileParser ]:
586 _scribe.debug( f"Failed to fetch/parse robots.txt from {domain}: {error}" )
587 if isinstance( error, _exceptions.RobotsTxtAccessFailure ):  # coverage: 587 ↛ 588 (condition never true)
588 result: RobotsResponse = _generics.Error( error )
589 else:
590 access_failure = _exceptions.RobotsTxtAccessFailure( domain, error )
591 result = _generics.Error( access_failure )
592 return await _cache_robots_txt_result( cache, domain, result )
595async def _cache_robots_txt_result(
596 cache: RobotsCache, domain: str, result: RobotsResponse
597) -> __.Absential[ _RobotFileParser ]:
598 ttl = cache.determine_ttl( result )
599 await cache.store( domain, result, ttl )
600 return result.extract( ) if result.is_value( ) else __.absent
603async def _check_robots_txt(
604 url: _Url, *,
605 client: _httpx.AsyncClient,
606 cache: RobotsCache,
607) -> bool:
608 ''' Checks if URL is allowed by robots.txt. '''
609 if url.scheme not in ( 'http', 'https' ): return True  # coverage: 609 ↛ exit (return not executed)
610 url_s = url.geturl( )
611 domain = _extract_domain( url )
612 try: parser = await cache.access( client, domain )
613 except _exceptions.RobotsTxtAccessFailure as exc:
614 _scribe.warning(
615 f"robots.txt access failed for {domain}: {exc.cause}. "
616 f"Proceeding without robots.txt validation." )
617 return True # Allow access when robots.txt unavailable
618 try: return parser.can_fetch( cache.user_agent, url_s )
619 except Exception as exc:
620 _scribe.debug( f"robots.txt check failed for {url_s}: {exc}" )
621 return True # if no robots.txt, then assume URL allowed
624def _detect_charset_with_fallback(
625 content: bytes, headers: _httpx.Headers, default: str
626) -> str:
627 ''' Detects charset from headers with content-based fallback. '''
628 header_charset = _extract_charset_from_headers( headers, '' )
629 if header_charset:
630 return header_charset
631 detected_charset = __.detext.detect_charset( content )
632 return detected_charset or default
635def _detect_mimetype_with_fallback(
636 content: bytes, headers: _httpx.Headers, url: str
637) -> str:
638 ''' Detects MIME type from headers with content-based fallback. '''
639 header_mimetype = _extract_mimetype_from_headers( headers )
640 if header_mimetype:  # coverage: 640 ↛ 642 (condition always true)
641 return header_mimetype
642 return __.detext.detect_mimetype( content, url ) or ''
645def _extract_charset_from_headers(
646 headers: _httpx.Headers, default: str
647) -> str:
648 ''' Extracts charset from Content-Type header. '''
649 content_type = headers.get( 'content-type', '' )
650 if isinstance( content_type, str ) and ';' in content_type:
651 _, _, params = content_type.partition( ';' )
652 if 'charset=' in params:
653 charset = params.split( 'charset=' )[ -1 ].strip( )
654 return charset.strip( '"\\\'\"' )
655 return default
658def _extract_domain( url: _Url ) -> str:
659 ''' Extracts domain from URL for robots.txt caching. '''
660 return f"{url.scheme}://{url.netloc}"
663def _extract_mimetype_from_headers( headers: _httpx.Headers ) -> str:
664 ''' Extracts mimetype from Content-Type header. '''
665 content_type = headers.get( 'content-type', '' )
666 if isinstance( content_type, str ) and ';' in content_type:
667 mimetype, _, _ = content_type.partition( ';' )
668 return mimetype.strip( )
669 return content_type
672async def _probe_url(
673 url: _Url, /, *,
674 duration_max: float,
675 client: _httpx.AsyncClient,
676 probe_cache: ProbeCache,
677 robots_cache: RobotsCache,
678) -> ProbeResponse:
679 ''' Makes HEAD request with deduplication. '''
680 url_s = url.geturl( )
681 if not await _check_robots_txt(  # coverage: 681 ↛ 684 (condition never true)
682 url, client = client, cache = robots_cache
683 ):
684 _scribe.debug( f"URL blocked by robots.txt: {url_s}" )
685 return _generics.Error( _exceptions.UrlImpermissibility(
686 url_s, robots_cache.user_agent ) )
687 await _apply_request_delay( url, cache = robots_cache, client = client )
688 async with probe_cache.acquire_mutex_for( url_s ):
689 try:
690 response = await client.head(
691 url_s, timeout = duration_max, follow_redirects = True )
692 except Exception as exc:
693 _scribe.debug( f"HEAD request failed for {url_s}: {exc}" )
694 return _generics.Error( exc )
695 else:
696 return _generics.Value(
697 response.status_code < _http_success_threshold )
700async def _retrieve_robots_txt(
701 client: _httpx.AsyncClient, cache: RobotsCache, domain: str
702) -> __.Absential[ _RobotFileParser ]:
703 ''' Fetches and parses robots.txt for domain. '''
704 robots_url = f"{domain}/robots.txt"
705 async with cache.acquire_mutex_for( domain ):
706 timeout = cache.request_timeout
707 try:
708 response = await client.get(
709 robots_url, timeout = timeout, follow_redirects = True )
710 except Exception as exc:
711 return await _cache_robots_txt_error( domain, cache, exc )
712 match response.status_code:
713 case _HttpStatus.OK: lines = response.text.splitlines( )  # coverage: 713 ↛ 714 (pattern always matched)
714 case _HttpStatus.NOT_FOUND: lines = [ ]
715 case _:
716 try: response.raise_for_status( )
717 except Exception as exc:
718 return await _cache_robots_txt_error( domain, cache, exc )
719 robots_parser = _RobotFileParser( )
720 robots_parser.set_url( robots_url )
721 try: robots_parser.parse( lines )
722 except Exception as exc:
723 return await _cache_robots_txt_error( domain, cache, exc )
724 result: RobotsResponse = _generics.Value( robots_parser )
725 return await _cache_robots_txt_result( cache, domain, result )
728async def _retrieve_url(
729 url: _Url, /, *,
730 duration_max: float,
731 client: _httpx.AsyncClient,
732 content_cache: ContentCache,
733 robots_cache: RobotsCache,
734) -> tuple[ ContentResponse, _httpx.Headers ]:
735 ''' Makes GET request with deduplication. '''
736 url_s = url.geturl( )
737 if not await _check_robots_txt(  # coverage: 737 ↛ 740 (condition never true)
738 url, cache = robots_cache, client = client
739 ):
740 return (
741 _generics.Error( _exceptions.UrlImpermissibility(
742 url_s, robots_cache.user_agent ) ),
743 _httpx.Headers( ) )
744 await _apply_request_delay( url, cache = robots_cache, client = client )
745 async with content_cache.acquire_mutex_for( url_s ):
746 try:
747 response = await client.get(
748 url_s, timeout = duration_max, follow_redirects = True )
749 response.raise_for_status( )
750 except Exception as exc:
751 _scribe.debug( f"GET request failed for {url_s}: {exc}" )
752 return _generics.Error( exc ), _httpx.Headers( )
753 else: return _generics.Value( response.content ), response.headers
756def _validate_textual_content(
757 content: bytes, headers: _httpx.Headers, url: str
758) -> None:
759 ''' Validates that content is textual via headers and content analysis. '''
760 mimetype = _detect_mimetype_with_fallback( content, headers, url )
761 if mimetype and not __.detext.is_textual_mimetype( mimetype ):
762 raise _exceptions.HttpContentTypeInvalidity(
763 url, mimetype, "text decoding" )
764 if not __.detext.is_textual_content( content ):  # coverage: 764 ↛ 765 (condition never true)
765 raise _exceptions.HttpContentTypeInvalidity(
766 url, mimetype or 'unknown', "content analysis" )