Coverage for sources/librovore/cacheproxy.py: 87%
411 statements
coverage.py v7.10.4, created at 2025-08-17 23:43 +0000
1# vim: set filetype=python fileencoding=utf-8:
2# -*- coding: utf-8 -*-
4#============================================================================#
5# #
6# Licensed under the Apache License, Version 2.0 (the "License"); #
7# you may not use this file except in compliance with the License. #
8# You may obtain a copy of the License at #
9# #
10# http://www.apache.org/licenses/LICENSE-2.0 #
11# #
12# Unless required by applicable law or agreed to in writing, software #
13# distributed under the License is distributed on an "AS IS" BASIS, #
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
15# See the License for the specific language governing permissions and #
16# limitations under the License. #
17# #
18#============================================================================#
21''' HTTP cache for documentation URL access. '''
24from http import HTTPStatus as _HttpStatus
25from urllib.parse import ParseResult as _Url
26from urllib.robotparser import RobotFileParser as _RobotFileParser
28import appcore.generics as _generics
29import httpx as _httpx
31from . import __
32from . import exceptions as _exceptions
35HttpClientFactory: __.typx.TypeAlias = (
36 __.cabc.Callable[ [ ], _httpx.AsyncClient ] )
37ContentResponse: __.typx.TypeAlias = _generics.Result[ bytes, Exception ]
38ProbeResponse: __.typx.TypeAlias = _generics.Result[ bool, Exception ]
39RobotsResponse: __.typx.TypeAlias = (
40 _generics.Result[ _RobotFileParser, Exception ] )
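# The three response aliases wrap appcore.generics.Result: a successful fetch
# carries its payload (bytes, bool, or a parsed RobotFileParser), while a
# failed fetch carries the exception.  Caching the Result rather than the raw
# value lets failures be retained briefly (error_ttl) instead of being retried
# on every call; extract( ) unwraps the payload at the call sites below.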
43class CacheEntry( __.immut.DataclassObject ):
44 ''' Cache entry base. '''
46 timestamp: float
47 ttl: float
49 @property
50 def invalid( self ) -> bool:
51 ''' Checks if cache entry has exceeded its TTL. '''
52 return __.time.time( ) - self.timestamp > self.ttl
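# Example: an entry stored with ttl = 300.0 reports invalid once more than
# 300 seconds have elapsed since its timestamp; access( ) then evicts it and
# the caller refetches.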
55class ContentCacheEntry( CacheEntry ):
56 ''' Cache entry for URL content with size tracking. '''
58 response: ContentResponse
59 headers: _httpx.Headers
60 size_bytes: int
62 @property
63 def memory_usage( self ) -> int:
64 ''' Calculates total memory usage including metadata. '''
65 return self.size_bytes + 100 # Overhead estimate
68class ProbeCacheEntry( CacheEntry ):
69 ''' Cache entry for URL probe results. '''
71 response: ProbeResponse
74class RobotsCacheEntry( CacheEntry ):
75 ''' Cache entry for robots.txt parser. '''
77 response: RobotsResponse
80class Cache( __.immut.Object ):
81 ''' Cache base with shared configuration attributes. '''
83 error_ttl: float = 30.0
84 success_ttl: float = 300.0
86 def __init__(
87 self, *,
88 error_ttl: __.Absential[ float ] = __.absent,
89 success_ttl: __.Absential[ float ] = __.absent,
90 delay_function: __.cabc.Callable[
91 [ float ], __.cabc.Awaitable[ None ]
92 ] = __.asyncio.sleep
93 ) -> None:
94 if not __.is_absent( error_ttl ): self.error_ttl = error_ttl
95 if not __.is_absent( success_ttl ): self.success_ttl = success_ttl
96 self.delay_function = delay_function
97 self._request_mutexes: dict[ str, __.asyncio.Lock ] = { }
99 @__.ctxl.asynccontextmanager
100 async def acquire_mutex_for( self, url: str ):
101 ''' Acquires mutex for HTTP request deduplication. '''
102 if url not in self._request_mutexes: # pragma: no branch
103 self._request_mutexes[ url ] = __.asyncio.Lock( )
104 mutex = self._request_mutexes[ url ]
105 async with mutex:
106 try: yield
107 finally: self._request_mutexes.pop( url, None )
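# Request deduplication sketch: concurrent fetches of one URL serialize on a
# shared per-URL lock, so only one HTTP request is in flight at a time.
#     async with cache.acquire_mutex_for( url_s ):
#         ...  # perform the HEAD/GET request
# The lock is dropped from the registry once its holder finishes.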
110class RobotsCache( Cache ):
111 ''' Cache manager for robots.txt files with crawl delay tracking. '''
113 entries_max: int = 500
114 request_timeout: float = 5.0
115 ttl: float = 3600.0
116 user_agent: str = '*'
118 def __init__(
119 self, *,
120 entries_max: __.Absential[ int ] = __.absent,
121 ttl: __.Absential[ float ] = __.absent,
122 request_timeout: __.Absential[ float ] = __.absent,
123 user_agent: __.Absential[ str ] = __.absent,
124 **base_initargs: __.typx.Any
125 ) -> None:
126 super( ).__init__( **base_initargs )
127 if not __.is_absent( entries_max ): self.entries_max = entries_max
128 if not __.is_absent( ttl ): self.ttl = ttl
129 if not __.is_absent( request_timeout ):
130 self.request_timeout = request_timeout
131 if not __.is_absent( user_agent ): self.user_agent = user_agent
132 self._cache: dict[ str, RobotsCacheEntry ] = { }
133 self._recency: __.collections.deque[ str ] = __.collections.deque( )
134 self._request_delays: dict[ str, float ] = { }
136 @classmethod
137 def from_configuration(
138 cls, configuration: __.cabc.Mapping[ str, __.typx.Any ]
139 ) -> __.typx.Self:
140 ''' Creates RobotsCache instance from application configuration. '''
141 cache_config = configuration.get( 'cache', { } )
142 robots_ttl = cache_config.get( 'robots-ttl', 3600.0 )
143 return cls( ttl = robots_ttl )
145 async def access( self, domain: str ) -> __.Absential[ _RobotFileParser ]:
146 ''' Retrieves cached robots.txt parser if valid. '''
147 if domain not in self._cache: return __.absent
148 entry = self._cache[ domain ]
149 if entry.invalid:
150 self._remove( domain )
151 return __.absent
152 self._record_access( domain )
153 return entry.response.extract( )
155 def assign_delay( self, domain: str, delay_seconds: float ) -> None:
156 ''' Sets next allowed request time for domain. '''
157 self._request_delays[ domain ] = __.time.time( ) + delay_seconds
159 def calculate_delay_remainder( self, domain: str ) -> float:
160 ''' Returns remaining crawl delay time for domain. '''
161 allow_at = self._request_delays.get( domain, 0.0 )
162 if not allow_at: return 0.0
163 remainder = allow_at - __.time.time( )
164 return max( 0.0, remainder )
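# Crawl-delay accounting: assign_delay( ) records an absolute "next allowed"
# timestamp per domain; calculate_delay_remainder( ) returns how long callers
# should still wait, clamped at zero once that moment has passed.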
166 def determine_ttl( self, response: RobotsResponse ) -> float:
167 ''' Determines appropriate TTL based on response type. '''
168 if response.is_value( ): return self.ttl
169 return self.error_ttl
171 async def store(
172 self, domain: str, response: RobotsResponse, ttl: float
173 ) -> None:
174 ''' Stores robots.txt parser in cache. '''
175 entry = RobotsCacheEntry(
176 response = response, timestamp = __.time.time( ), ttl = ttl )
177 self._cache[ domain ] = entry
178 self._record_access( domain )
179 self._evict_by_count( )
181 def _evict_by_count( self ) -> None:
182 ''' Evicts oldest entries when cache exceeds max size. '''
183 while (
184 len( self._cache ) > self.entries_max
185 and self._recency
186 ):
187 lru_domain = self._recency.popleft( )
188 if lru_domain in self._cache: # pragma: no branch
189 del self._cache[ lru_domain ]
191 def _record_access( self, domain: str ) -> None:
192 ''' Updates LRU access order for given domain. '''
193 with __.ctxl.suppress( ValueError ):
194 self._recency.remove( domain )
195 self._recency.append( domain )
197 def _remove( self, domain: str ) -> None:
198 ''' Removes entry from cache. '''
199 self._cache.pop( domain, None )
200 with __.ctxl.suppress( ValueError ):
201 self._recency.remove( domain )
204class ContentCache( Cache, instances_mutables = ( '_memory_total', ) ):
205 ''' Cache manager for URL content (GET requests) with memory tracking. '''
207 memory_max: int = 32 * 1024 * 1024
209 def __init__(
210 self, *,
211 robots_cache: __.Absential[ RobotsCache ] = __.absent,
212 memory_max: __.Absential[ int ] = __.absent,
213 **base_initargs: __.typx.Any
214 ) -> None:
215 super( ).__init__( **base_initargs )
216 if __.is_absent( robots_cache ):
217 self.robots_cache = RobotsCache( **base_initargs )
218 else: self.robots_cache = robots_cache
219 if not __.is_absent( memory_max ): self.memory_max = memory_max
220 self._cache: dict[ str, ContentCacheEntry ] = { }
221 self._memory_total = 0
222 self._recency: __.collections.deque[ str ] = __.collections.deque( )
224 @classmethod
225 def from_configuration(
226 cls,
227 configuration: __.cabc.Mapping[ str, __.typx.Any ],
228 robots_cache: __.Absential[ RobotsCache ] = __.absent
229 ) -> __.typx.Self:
230 ''' Creates ContentCache instance from application configuration. '''
231 cache_config = configuration.get( 'cache', { } )
232 content_ttl = cache_config.get( 'content-ttl', 300.0 )
233 memory_limit = cache_config.get( 'memory-limit', 33554432 )
234 nomargs = {
235 'success_ttl': content_ttl,
236 'memory_max': memory_limit,
237 }
238 if not __.is_absent( robots_cache ):
239 nomargs[ 'robots_cache' ] = robots_cache
240 return cls( **nomargs )
242 async def access(
243 self, url: str
244 ) -> __.Absential[ tuple[ bytes, _httpx.Headers ] ]:
245 ''' Retrieves cached content if valid. '''
246 if url not in self._cache: return __.absent
247 entry = self._cache[ url ]
248 if entry.invalid:
249 self._remove( url )
250 return __.absent
251 self._record_access( url )
252 return ( entry.response.extract( ), entry.headers )
254 def determine_ttl( self, response: ContentResponse ) -> float:
255 ''' Determines appropriate TTL based on response type. '''
256 if response.is_value( ):
257 return self.success_ttl
258 # TODO: Inspect exception type for more granular TTL
259 return self.error_ttl
261 async def retrieve_url(
262 self,
263 url: _Url, /, *,
264 duration_max: float = 30.0,
265 client_factory: HttpClientFactory = _httpx.AsyncClient,
266 ) -> bytes:
267 ''' Convenience method for retrieving URL content. '''
268 return await retrieve_url(
269 self, url,
270 duration_max = duration_max,
271 client_factory = client_factory )
273 async def store(
274 self, url: str, response: ContentResponse,
275 headers: _httpx.Headers, ttl: float
276 ) -> None:
277 ''' Stores content in cache with memory management. '''
278 size_bytes = self._calculate_response_size( response )
279 entry = ContentCacheEntry(
280 response = response,
281 headers = headers,
282 timestamp = __.time.time( ),
283 ttl = ttl,
284 size_bytes = size_bytes )
285 if old_entry := self._cache.get( url ):
286 self._memory_total -= old_entry.memory_usage
287 self._cache[ url ] = entry
288 self._memory_total += entry.memory_usage
289 self._record_access( url )
290 self._evict_by_memory( )
292 def _calculate_response_size( self, response: ContentResponse ) -> int:
293 ''' Calculates memory footprint of cached response. '''
294 if response.is_value( ):
295 content = response.extract( )
296 return len( content )
297 return 100 # Conservative estimate for exception overhead
299 def _evict_by_memory( self ) -> None:
300 ''' Evicts LRU entries until memory usage is under limit. '''
301 while (
302 self._memory_total > self.memory_max
303 and self._recency
304 ):
305 lru_url = self._recency.popleft( )
306 if lru_url in self._cache: # pragma: no branch
307 entry = self._cache[ lru_url ]
308 self._memory_total -= entry.memory_usage
309 del self._cache[ lru_url ]
310 _scribe.debug( f"Evicted cache entry: {lru_url}" )
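# Memory accounting: each entry contributes its payload size plus a fixed
# 100-byte overhead estimate; eviction pops least-recently-used URLs until
# the running total drops back under memory_max.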
312 def _record_access( self, url: str ) -> None:
313 ''' Updates LRU access order for given URL. '''
314 with __.ctxl.suppress( ValueError ):
315 self._recency.remove( url )
316 self._recency.append( url )
318 def _remove( self, url: str ) -> None:
319 ''' Removes entry from cache and updates memory tracking. '''
320 if entry := self._cache.pop( url, None ):
321 self._memory_total -= entry.memory_usage
322 with __.ctxl.suppress( ValueError ):
323 self._recency.remove( url )
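# Usage sketch (illustrative; the URL and limit are hypothetical), inside an
# async function:
#     from urllib.parse import urlparse
#     cache = ContentCache( memory_max = 8 * 1024 * 1024 )
#     payload = await cache.retrieve_url(
#         urlparse( 'https://example.org/index.html' ) )
# The first call performs a robots.txt-aware GET and caches the result; later
# calls within the TTL are served from memory.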
326class ProbeCache( Cache ):
327 ''' Cache manager for URL probe results (HEAD requests). '''
329 entries_max: int = 1000
331 def __init__(
332 self, *,
333 robots_cache: __.Absential[ RobotsCache ] = __.absent,
334 entries_max: __.Absential[ int ] = __.absent,
335 **base_initargs: __.typx.Any
336 ) -> None:
337 super( ).__init__( **base_initargs )
338 if __.is_absent( robots_cache ):
339 self.robots_cache = RobotsCache( **base_initargs )
340 else: self.robots_cache = robots_cache
341 if not __.is_absent( entries_max ): self.entries_max = entries_max
342 self._cache: dict[ str, ProbeCacheEntry ] = { }
343 self._recency: __.collections.deque[ str ] = __.collections.deque( )
345 @classmethod
346 def from_configuration(
347 cls,
348 configuration: __.cabc.Mapping[ str, __.typx.Any ],
349 robots_cache: __.Absential[ RobotsCache ] = __.absent
350 ) -> __.typx.Self:
351 ''' Creates ProbeCache instance from application configuration. '''
352 cache_config = configuration.get( 'cache', { } )
353 probe_ttl = cache_config.get( 'probe-ttl', 300.0 )
354 nomargs = { 'success_ttl': probe_ttl }
355 if not __.is_absent( robots_cache ):
356 nomargs[ 'robots_cache' ] = robots_cache
357 return cls( **nomargs )
359 async def access( self, url: str ) -> __.Absential[ bool ]:
360 ''' Retrieves cached probe result if valid. '''
361 if url not in self._cache: return __.absent
362 entry = self._cache[ url ]
363 if entry.invalid:
364 self._remove( url )
365 return __.absent
366 self._record_access( url )
367 return entry.response.extract( )
369 def determine_ttl( self, response: ProbeResponse ) -> float:
370 ''' Determines appropriate TTL based on response type. '''
371 if response.is_value( ):
372 return self.success_ttl
373 # TODO: Inspect exception type for more granular TTL
374 return self.error_ttl
376 async def probe_url(
377 self,
378 url: _Url, /, *,
379 duration_max: float = 10.0,
380 client_factory: HttpClientFactory = _httpx.AsyncClient,
381 ) -> bool:
382 ''' Convenience method for probing URL existence. '''
383 return await probe_url(
384 self, url,
385 duration_max = duration_max,
386 client_factory = client_factory )
388 async def store(
389 self, url: str, response: ProbeResponse, ttl: float
390 ) -> None:
391 ''' Stores probe result in cache. '''
392 entry = ProbeCacheEntry(
393 response = response,
394 timestamp = __.time.time( ),
395 ttl = ttl )
396 self._cache[ url ] = entry
397 self._record_access( url )
398 self._evict_by_count( )
400 def _evict_by_count( self ) -> None:
401 ''' Evicts oldest entries when cache exceeds max size. '''
402 while (
403 len( self._cache ) > self.entries_max
404 and self._recency
405 ):
406 lru_url = self._recency.popleft( )
407 if lru_url in self._cache: # pragma: no branch
408 del self._cache[ lru_url ]
410 def _record_access( self, url: str ) -> None:
411 ''' Updates LRU access order for given URL. '''
412 with __.ctxl.suppress( ValueError ):
413 self._recency.remove( url )
414 self._recency.append( url )
416 def _remove( self, url: str ) -> None:
417 ''' Removes entry from cache. '''
418 self._cache.pop( url, None )
419 with __.ctxl.suppress( ValueError ):
420 self._recency.remove( url )
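# Usage sketch (illustrative values), inside an async function:
#     from urllib.parse import urlparse
#     cache = ProbeCache( entries_max = 100 )
#     exists = await cache.probe_url(
#         urlparse( 'https://example.org/objects.inv' ) )
# A cached HEAD result answers repeat probes until its TTL lapses.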
423 _http_success_threshold = 400
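# Any HEAD status below 400 (informational, success, or redirect) is treated
# by _probe_url as evidence that the URL exists.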
426class CacheContext( __.immut.DataclassObject ):
427 ''' Context carrying configured cache instances. '''
429 content_cache: ContentCache
430 probe_cache: ProbeCache
431 robots_cache: RobotsCache
433 @classmethod
434 def from_configuration(
435 cls,
436 configuration: __.cabc.Mapping[ str, __.typx.Any ]
437 ) -> __.typx.Self:
438 ''' Creates cache context from application configuration. '''
439 robots_cache = RobotsCache.from_configuration( configuration )
440 return cls(
441 content_cache = ContentCache.from_configuration(
442 configuration, robots_cache ),
443 probe_cache = ProbeCache.from_configuration(
444 configuration, robots_cache ),
445 robots_cache = robots_cache,
446 )
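# Configuration sketch (keys taken from the from_configuration methods above;
# the values shown are the coded defaults and are illustrative):
#     configuration = { 'cache': {
#         'robots-ttl': 3600.0, 'content-ttl': 300.0,
#         'probe-ttl': 300.0, 'memory-limit': 33554432 } }
#     context = CacheContext.from_configuration( configuration )
# All three caches constructed this way share a single RobotsCache.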
449 _scribe = __.acquire_scribe( __name__ )
452def prepare(
453 auxdata: __.Globals
454) -> tuple[ ContentCache, ProbeCache, RobotsCache ]:
455 ''' Prepares cache instances from configuration.
457 Returns cache instances constructed from application configuration.
458 '''
459 configuration = auxdata.configuration
460 robots_cache = RobotsCache.from_configuration( configuration )
461 return (
462 ContentCache.from_configuration( configuration, robots_cache ),
463 ProbeCache.from_configuration( configuration, robots_cache ),
464 robots_cache,
465 )
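# prepare( ) is the functional counterpart of CacheContext.from_configuration:
# it reads auxdata.configuration and returns ( ContentCache, ProbeCache,
# RobotsCache ) built around one shared RobotsCache instance.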
468async def probe_url(
469 cache: ProbeCache,
470 url: _Url, *,
471 duration_max: float = 10.0,
472 client_factory: HttpClientFactory = _httpx.AsyncClient,
473) -> bool:
474 ''' Cached HEAD request to check URL existence. '''
475 url_s = url.geturl( )
476 match url.scheme:
477 case '' | 'file':
478 return __.Path( url.path ).exists( )
479 case 'http' | 'https':
480 result = await cache.access( url_s )
481 if not __.is_absent( result ): return result
482 async with client_factory( ) as client:
483 result = await _probe_url(
484 url, duration_max = duration_max,
485 client = client,
486 probe_cache = cache,
487 robots_cache = cache.robots_cache )
488 ttl = cache.determine_ttl( result )
489 await cache.store( url_s, result, ttl )
490 return result.extract( )
491 case _: return False
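# Scheme dispatch: file and empty schemes answer via Path.exists( ); HTTP(S)
# URLs go through the cache and a robots.txt-guarded HEAD request; any other
# scheme is reported as nonexistent.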
494async def retrieve_url(
495 cache: ContentCache,
496 url: _Url, *,
497 duration_max: float = 30.0,
498 client_factory: HttpClientFactory = _httpx.AsyncClient,
499) -> bytes:
500 ''' Cached GET request to fetch URL content as bytes. '''
501 url_s = url.geturl( )
502 match url.scheme:
503 case '' | 'file':
504 location = __.Path( url.path )
505 try: return location.read_bytes( )
506 except Exception as exc:
507 raise _exceptions.DocumentationInaccessibility(
508 url_s, exc ) from exc
509 case 'http' | 'https':
510 result = await cache.access( url_s )
511 if not __.is_absent( result ):
512 content_bytes, _ = result
513 return content_bytes
514 async with client_factory( ) as client:
515 result, headers = await _retrieve_url(
516 url,
517 duration_max = duration_max,
518 client = client,
519 content_cache = cache,
520 robots_cache = cache.robots_cache )
521 ttl = cache.determine_ttl( result )
522 await cache.store( url_s, result, headers, ttl )
523 return result.extract( )
524 case _:
525 raise _exceptions.DocumentationInaccessibility(
526 url_s, f"Unsupported scheme: {url.scheme}" )
529async def retrieve_url_as_text(
530 cache: ContentCache,
531 url: _Url, *,
532 duration_max: float = 30.0,
533 charset_default: str = 'utf-8',
534 client_factory: HttpClientFactory = _httpx.AsyncClient,
535) -> str:
536 ''' Cached GET request to fetch URL content as text. '''
537 url_s = url.geturl( )
538 match url.scheme:
539 case '' | 'file':
540 location = __.Path( url.path )
541 try: content_bytes = location.read_bytes( )
542 except Exception as exc:
543 raise _exceptions.DocumentationInaccessibility(
544 url_s, exc ) from exc
545 mimetype, charset = __.detext.detect_mimetype_and_charset(
546 content_bytes, location )
547 if not __.detext.is_textual_content( content_bytes ):  [547 ↛ 548: line 547 didn't jump to line 548 because the condition on line 547 was never true]
548 raise _exceptions.DocumentationInaccessibility(
549 url_s, "Content analysis indicates non-textual data" )
550 encoding = charset or charset_default
551 return content_bytes.decode( encoding )
552 case 'http' | 'https':
553 result = await cache.access( url_s )
554 if not __.is_absent( result ):
555 content_bytes, headers = result
556 _validate_textual_content(
557 content_bytes, headers, url_s )
558 charset = _detect_charset_with_fallback(
559 content_bytes, headers, charset_default )
560 return content_bytes.decode( charset )
561 async with client_factory( ) as client:
562 result, headers = await _retrieve_url(
563 url, duration_max = duration_max,
564 client = client,
565 content_cache = cache,
566 robots_cache = cache.robots_cache )
567 ttl = cache.determine_ttl( result )
568 await cache.store( url_s, result, headers, ttl )
569 content_bytes = result.extract( )
570 _validate_textual_content(
571 content_bytes, headers, url_s )
572 charset = _detect_charset_with_fallback(
573 content_bytes, headers, charset_default )
574 return content_bytes.decode( charset )
575 case _:
576 raise _exceptions.DocumentationInaccessibility(
577 url_s, f"Unsupported scheme: {url.scheme}" )
580async def _apply_request_delay(
581 url: _Url,
582 client: _httpx.AsyncClient,
583 cache: RobotsCache,
584) -> None:
585 ''' Applies crawl delay to request if specified in robots.txt. '''
586 if url.scheme not in ( 'http', 'https' ): return  [586 ↛ exit: line 586 didn't return from function '_apply_request_delay' because the return on line 586 wasn't executed]
587 domain = _extract_domain( url )
588 delay = cache.calculate_delay_remainder( domain )
589 if delay > 0: await cache.delay_function( delay )
590 parser = await cache.access( domain )
591 if __.is_absent( parser ):  [591 ↛ 592: line 591 didn't jump to line 592 because the condition on line 591 was never true]
592 parser = await _retrieve_robots_txt( client, cache, domain )
593 if not __.is_absent( parser ):  [593 ↛ exit: line 593 didn't return from function '_apply_request_delay' because the condition on line 593 was always true]
594 try: delay = parser.crawl_delay( cache.user_agent )
595 except Exception as exc:
596 _scribe.debug( f"Failed to get crawl delay for {domain}: {exc}" )
597 else:
598 if delay: cache.assign_delay( domain, float( delay ) )
601async def _cache_robots_txt_error(
602 domain: str, cache: RobotsCache, error: Exception
603) -> __.Absential[ _RobotFileParser ]:
604 _scribe.debug( f"Failed to fetch/parse robots.txt from {domain}: {error}" )
605 result: RobotsResponse = _generics.Error( error )
606 return await _cache_robots_txt_result( cache, domain, result )
609async def _cache_robots_txt_result(
610 cache: RobotsCache, domain: str, result: RobotsResponse
611) -> __.Absential[ _RobotFileParser ]:
612 ttl = cache.determine_ttl( result )
613 await cache.store( domain, result, ttl )
614 return result.extract( ) if result.is_value( ) else __.absent
617async def _check_robots_txt(
618 url: _Url, *,
619 client: _httpx.AsyncClient,
620 cache: RobotsCache,
621) -> bool:
622 ''' Checks if URL is allowed by robots.txt. '''
623 if url.scheme not in ( 'http', 'https' ): return True  [623 ↛ exit: line 623 didn't return from function '_check_robots_txt' because the return on line 623 wasn't executed]
624 url_s = url.geturl( )
625 domain = _extract_domain( url )
626 parser = await cache.access( domain )
627 if __.is_absent( parser ):  [627 ↛ 630: line 627 didn't jump to line 630 because the condition on line 627 was always true]
628 parser = await _retrieve_robots_txt( client, cache, domain )
629 if __.is_absent( parser ): return True
630 try: return parser.can_fetch( cache.user_agent, url_s )
631 except Exception as exc:
632 _scribe.debug( f"robots.txt check failed for {url_s}: {exc}" )
633 return True # assume URL allowed when the robots.txt check fails
636def _detect_charset_with_fallback(
637 content: bytes, headers: _httpx.Headers, default: str
638) -> str:
639 ''' Detects charset from headers with content-based fallback. '''
640 header_charset = _extract_charset_from_headers( headers, '' )
641 if header_charset:
642 return header_charset
643 detected_charset = __.detext.detect_charset( content )
644 return detected_charset or default
647def _detect_mimetype_with_fallback(
648 content: bytes, headers: _httpx.Headers, url: str
649) -> str:
650 ''' Detects MIME type from headers with content-based fallback. '''
651 header_mimetype = _extract_mimetype_from_headers( headers )
652 if header_mimetype:  [652 ↛ 654: line 652 didn't jump to line 654 because the condition on line 652 was always true]
653 return header_mimetype
654 return __.detext.detect_mimetype( content, url ) or ''
657def _extract_charset_from_headers(
658 headers: _httpx.Headers, default: str
659) -> str:
660 ''' Extracts charset from Content-Type header. '''
661 content_type = headers.get( 'content-type', '' )
662 if isinstance( content_type, str ) and ';' in content_type:
663 _, _, params = content_type.partition( ';' )
664 if 'charset=' in params:
665 charset = params.split( 'charset=' )[ -1 ].strip( )
666 return charset.strip( '"\\\'\"' )
667 return default
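# Example: a Content-Type of 'text/html; charset=utf-8' yields 'utf-8', while
# a bare 'text/html' (no parameters) falls through to the supplied default.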
670def _extract_domain( url: _Url ) -> str:
671 ''' Extracts domain from URL for robots.txt caching. '''
672 return f"{url.scheme}://{url.netloc}"
675def _extract_mimetype_from_headers( headers: _httpx.Headers ) -> str:
676 ''' Extracts mimetype from Content-Type header. '''
677 content_type = headers.get( 'content-type', '' )
678 if isinstance( content_type, str ) and ';' in content_type:
679 mimetype, _, _ = content_type.partition( ';' )
680 return mimetype.strip( )
681 return content_type
684def _raise_non_textual_content( url: str ) -> None:
685 ''' Raises exception for non-textual content. '''
686 raise _exceptions.DocumentationInaccessibility(
687 url, "Content analysis indicates non-textual data" )
690def _raise_non_textual_mimetype( url: str, mimetype: str ) -> None:
691 ''' Raises exception for non-textual MIME type. '''
692 raise _exceptions.DocumentationInaccessibility(
693 url, f"Non-textual content detected: {mimetype}" )
697async def _probe_url(
698 url: _Url, /, *,
699 duration_max: float,
700 client: _httpx.AsyncClient,
701 probe_cache: ProbeCache,
702 robots_cache: RobotsCache,
703) -> ProbeResponse:
704 ''' Makes HEAD request with deduplication. '''
705 url_s = url.geturl( )
706 if not await _check_robots_txt(  [706 ↛ 709: line 706 didn't jump to line 709 because the condition on line 706 was never true]
707 url, client = client, cache = robots_cache
708 ):
709 _scribe.debug( f"URL blocked by robots.txt: {url_s}" )
710 return _generics.Error( _exceptions.UrlImpermissibility(
711 url_s, robots_cache.user_agent ) )
712 await _apply_request_delay( url, cache = robots_cache, client = client )
713 async with probe_cache.acquire_mutex_for( url_s ):
714 try:
715 response = await client.head(
716 url_s, timeout = duration_max, follow_redirects = True )
717 except Exception as exc:
718 _scribe.debug( f"HEAD request failed for {url_s}: {exc}" )
719 return _generics.Error( exc )
720 else:
721 return _generics.Value(
722 response.status_code < _http_success_threshold )
725async def _retrieve_robots_txt(
726 client: _httpx.AsyncClient, cache: RobotsCache, domain: str
727) -> __.Absential[ _RobotFileParser ]:
728 ''' Fetches and parses robots.txt for domain. '''
729 robots_url = f"{domain}/robots.txt"
730 async with cache.acquire_mutex_for( domain ):
731 timeout = cache.request_timeout
732 try:
733 response = await client.get(
734 robots_url, timeout = timeout, follow_redirects = True )
735 except Exception as exc:
736 return await _cache_robots_txt_error( domain, cache, exc )
737 match response.status_code:
738 case _HttpStatus.OK: lines = response.text.splitlines( )  [738 ↛ 739: line 738 didn't jump to line 739 because the pattern on line 738 always matched]
739 case _HttpStatus.NOT_FOUND: lines = [ ]
740 case _:
741 try: response.raise_for_status( )
742 except Exception as exc:
743 return await _cache_robots_txt_error( domain, cache, exc )
744 robots_parser = _RobotFileParser( )
745 robots_parser.set_url( robots_url )
746 try: robots_parser.parse( lines )
747 except Exception as exc:
748 return await _cache_robots_txt_error( domain, cache, exc )
749 result: RobotsResponse = _generics.Value( robots_parser )
750 return await _cache_robots_txt_result( cache, domain, result )
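# A 200 response is parsed as-is; a 404 is parsed as an empty robots.txt,
# which permits everything.  Any other status, or a fetch/parse failure, is
# cached as an error Result so the domain is not retried before error_ttl
# expires.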
753async def _retrieve_url(
754 url: _Url, /, *,
755 duration_max: float,
756 client: _httpx.AsyncClient,
757 content_cache: ContentCache,
758 robots_cache: RobotsCache,
759) -> tuple[ ContentResponse, _httpx.Headers ]:
760 ''' Makes GET request with deduplication. '''
761 url_s = url.geturl( )
762 if not await _check_robots_txt(  [762 ↛ 765: line 762 didn't jump to line 765 because the condition on line 762 was never true]
763 url, cache = robots_cache, client = client
764 ):
765 return (
766 _generics.Error( _exceptions.UrlImpermissibility(
767 url_s, robots_cache.user_agent ) ),
768 _httpx.Headers( ) )
769 await _apply_request_delay( url, cache = robots_cache, client = client )
770 async with content_cache.acquire_mutex_for( url_s ):
771 try:
772 response = await client.get(
773 url_s, timeout = duration_max, follow_redirects = True )
774 response.raise_for_status( )
775 except Exception as exc:
776 _scribe.debug( f"GET request failed for {url_s}: {exc}" )
777 return _generics.Error( exc ), _httpx.Headers( )
778 else: return _generics.Value( response.content ), response.headers
781def _validate_textual_content(
782 content: bytes, headers: _httpx.Headers, url: str
783) -> None:
784 ''' Validates that content is textual via headers and content analysis. '''
785 mimetype = _detect_mimetype_with_fallback( content, headers, url )
786 if mimetype and not __.detext.is_textual_mimetype( mimetype ):
787 raise _exceptions.HttpContentTypeInvalidity(
788 url, mimetype, "text decoding" )
789 if not __.detext.is_textual_content( content ):  [789 ↛ 790: line 789 didn't jump to line 790 because the condition on line 789 was never true]
790 raise _exceptions.HttpContentTypeInvalidity(
791 url, mimetype or 'unknown', "content analysis" )