Coverage for sources/librovore/cacheproxy.py: 87%
399 statements
coverage.py v7.10.6, created at 2025-09-02 00:02 +0000
1# vim: set filetype=python fileencoding=utf-8:
2# -*- coding: utf-8 -*-
4#============================================================================#
5# #
6# Licensed under the Apache License, Version 2.0 (the "License"); #
7# you may not use this file except in compliance with the License. #
8# You may obtain a copy of the License at #
9# #
10# http://www.apache.org/licenses/LICENSE-2.0 #
11# #
12# Unless required by applicable law or agreed to in writing, software #
13# distributed under the License is distributed on an "AS IS" BASIS, #
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
15# See the License for the specific language governing permissions and #
16# limitations under the License. #
17# #
18#============================================================================#
21''' HTTP cache for documentation URL access. '''
24from http import HTTPStatus as _HttpStatus
25from urllib.parse import ParseResult as _Url
26from urllib.robotparser import RobotFileParser as _RobotFileParser
28import appcore.generics as _generics
29import httpx as _httpx
31from . import __
32from . import exceptions as _exceptions
35HttpClientFactory: __.typx.TypeAlias = (
36 __.cabc.Callable[ [ ], _httpx.AsyncClient ] )
37ContentResponse: __.typx.TypeAlias = _generics.Result[ bytes, Exception ]
38ProbeResponse: __.typx.TypeAlias = _generics.Result[ bool, Exception ]
39RobotsResponse: __.typx.TypeAlias = (
40 _generics.Result[ _RobotFileParser, Exception ] )
43class CacheEntry( __.immut.DataclassObject ):
44 ''' Cache entry base. '''
46 timestamp: float
47 ttl: float
49 @property
50 def invalid( self ) -> bool:
51 ''' Checks if cache entry has exceeded its TTL. '''
52 return __.time.time( ) - self.timestamp > self.ttl
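# A brief hedged sketch of the TTL arithmetic above; the timestamp and TTL
# values are hypothetical and only illustrate `invalid` flipping once the
# TTL elapses.
#
#     entry = CacheEntry( timestamp = __.time.time( ) - 60.0, ttl = 30.0 )
#     entry.invalid  # True: 60 seconds elapsed against a 30 second TTL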
55class ContentCacheEntry( CacheEntry ):
56 ''' Cache entry for URL content with size tracking. '''
58 response: ContentResponse
59 headers: _httpx.Headers
60 size_bytes: int
62 @property
63 def memory_usage( self ) -> int:
64 ''' Calculates total memory usage including metadata. '''
65 return self.size_bytes + 100 # Overhead estimate
68class ProbeCacheEntry( CacheEntry ):
69 ''' Cache entry for URL probe results. '''
71 response: ProbeResponse
74class RobotsCacheEntry( CacheEntry ):
75 ''' Cache entry for robots.txt parser. '''
77 response: RobotsResponse
80class Cache( __.immut.Object ):
81 ''' Cache base with shared configuration attributes. '''
83 error_ttl: float = 30.0
84 success_ttl: float = 300.0
86 def __init__(
87 self, *,
88 error_ttl: __.Absential[ float ] = __.absent,
89 success_ttl: __.Absential[ float ] = __.absent,
90 delay_function: __.cabc.Callable[
91 [ float ], __.cabc.Awaitable[ None ]
92 ] = __.asyncio.sleep
93 ) -> None:
94 if not __.is_absent( error_ttl ): self.error_ttl = error_ttl
95 if not __.is_absent( success_ttl ): self.success_ttl = success_ttl
96 self.delay_function = delay_function
97 self._request_mutexes: dict[ str, __.asyncio.Lock ] = { }
99 @__.ctxl.asynccontextmanager
100 async def acquire_mutex_for( self, url: str ):
101 ''' Acquires mutex for HTTP request deduplication. '''
102 if url not in self._request_mutexes: # pragma: no branch
103 self._request_mutexes[ url ] = __.asyncio.Lock( )
104 mutex = self._request_mutexes[ url ]
105 async with mutex:
106 try: yield
107 finally: self._request_mutexes.pop( url, None )
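# Hedged usage sketch for the request-deduplication mutex above; the helper
# name and URL parameter are hypothetical, not part of the module's API.
async def _example_deduplicated_request( cache: Cache, url: str ) -> None:
    async with cache.acquire_mutex_for( url ):
        # Concurrent callers for the same URL serialize here, so only one
        # underlying HTTP request needs to be in flight at a time.
        ...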
110class RobotsCache( Cache ):
111 ''' Cache manager for robots.txt files with crawl delay tracking. '''
113 entries_max: int = 500
114 request_timeout: float = 5.0
115 ttl: float = 3600.0
116 user_agent: str = '*'
118 def __init__(
119 self, *,
120 entries_max: __.Absential[ int ] = __.absent,
121 ttl: __.Absential[ float ] = __.absent,
122 request_timeout: __.Absential[ float ] = __.absent,
123 user_agent: __.Absential[ str ] = __.absent,
124 **base_initargs: __.typx.Any
125 ) -> None:
126 super( ).__init__( **base_initargs )
127 if not __.is_absent( entries_max ): self.entries_max = entries_max
128 if not __.is_absent( ttl ): self.ttl = ttl
129 if not __.is_absent( request_timeout ):
130 self.request_timeout = request_timeout
131 if not __.is_absent( user_agent ): self.user_agent = user_agent
132 self._cache: dict[ str, RobotsCacheEntry ] = { }
133 self._recency: __.collections.deque[ str ] = __.collections.deque( )
134 self._request_delays: dict[ str, float ] = { }
136 @classmethod
137 def from_configuration(
138 cls, configuration: __.cabc.Mapping[ str, __.typx.Any ]
139 ) -> __.typx.Self:
140 ''' Creates RobotsCache instance from application configuration. '''
141 cache_config = configuration.get( 'cache', { } )
142 robots_ttl = cache_config.get( 'robots-ttl', 3600.0 )
143 return cls( ttl = robots_ttl )
145 async def access( self, domain: str ) -> __.Absential[ _RobotFileParser ]:
146 ''' Retrieves cached robots.txt parser if valid. '''
147 if domain not in self._cache: return __.absent
148 entry = self._cache[ domain ]
149 if entry.invalid:
150 self._remove( domain )
151 return __.absent
152 self._record_access( domain )
153 return entry.response.extract( )
155 def assign_delay( self, domain: str, delay_seconds: float ) -> None:
156 ''' Sets next allowed request time for domain. '''
157 self._request_delays[ domain ] = __.time.time( ) + delay_seconds
159 def calculate_delay_remainder( self, domain: str ) -> float:
160 ''' Returns remaining crawl delay time for domain. '''
161 allow_at = self._request_delays.get( domain, 0.0 )
162 if not allow_at: return 0.0
163 remainder = allow_at - __.time.time( )
164 return max( 0.0, remainder )
166 def determine_ttl( self, response: RobotsResponse ) -> float:
167 ''' Determines appropriate TTL based on response type. '''
168 if response.is_value( ): return self.ttl
169 return self.error_ttl
171 async def store(
172 self, domain: str, response: RobotsResponse, ttl: float
173 ) -> None:
174 ''' Stores robots.txt parser in cache. '''
175 entry = RobotsCacheEntry(
176 response = response, timestamp = __.time.time( ), ttl = ttl )
177 self._cache[ domain ] = entry
178 self._record_access( domain )
179 self._evict_by_count( )
181 def _evict_by_count( self ) -> None:
182 ''' Evicts oldest entries when cache exceeds max size. '''
183 while (
184 len( self._cache ) > self.entries_max
185 and self._recency
186 ):
187 lru_domain = self._recency.popleft( )
188 if lru_domain in self._cache: # pragma: no branch
189 del self._cache[ lru_domain ]
191 def _record_access( self, domain: str ) -> None:
192 ''' Updates LRU access order for given domain. '''
193 with __.ctxl.suppress( ValueError ):
194 self._recency.remove( domain )
195 self._recency.append( domain )
197 def _remove( self, domain: str ) -> None:
198 ''' Removes entry from cache. '''
199 self._cache.pop( domain, None )
200 with __.ctxl.suppress( ValueError ):
201 self._recency.remove( domain )
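# Hedged sketch of the crawl-delay bookkeeping above; the domain string is
# hypothetical. `assign_delay` records an absolute next-allowed time and
# `calculate_delay_remainder` reports how long a caller should still wait.
async def _example_honor_crawl_delay( robots: RobotsCache ) -> None:
    domain = 'https://example.org'
    robots.assign_delay( domain, 2.0 ) # robots.txt requested a 2 second delay
    remainder = robots.calculate_delay_remainder( domain )
    if remainder > 0: await robots.delay_function( remainder )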
204class ContentCache( Cache, instances_mutables = ( '_memory_total', ) ):
205 ''' Cache manager for URL content (GET requests) with memory tracking. '''
207 memory_max: int = 32 * 1024 * 1024
209 def __init__(
210 self, *,
211 robots_cache: __.Absential[ RobotsCache ] = __.absent,
212 memory_max: __.Absential[ int ] = __.absent,
213 **base_initargs: __.typx.Any
214 ) -> None:
215 super( ).__init__( **base_initargs )
216 if __.is_absent( robots_cache ):
217 self.robots_cache = RobotsCache( **base_initargs )
218 else: self.robots_cache = robots_cache
219 if not __.is_absent( memory_max ): self.memory_max = memory_max
220 self._cache: dict[ str, ContentCacheEntry ] = { }
221 self._memory_total = 0
222 self._recency: __.collections.deque[ str ] = __.collections.deque( )
224 @classmethod
225 def from_configuration(
226 cls,
227 configuration: __.cabc.Mapping[ str, __.typx.Any ],
228 robots_cache: __.Absential[ RobotsCache ] = __.absent
229 ) -> __.typx.Self:
230 ''' Creates ContentCache instance from application configuration. '''
231 cache_config = configuration.get( 'cache', { } )
232 content_ttl = cache_config.get( 'content-ttl', 300.0 )
233 memory_limit = cache_config.get( 'memory-limit', 33554432 )
234 nomargs = {
235 'success_ttl': content_ttl,
236 'memory_max': memory_limit,
237 }
238 if not __.is_absent( robots_cache ):
239 nomargs[ 'robots_cache' ] = robots_cache
240 return cls( **nomargs )
242 async def access(
243 self, url: str
244 ) -> __.Absential[ tuple[ bytes, _httpx.Headers ] ]:
245 ''' Retrieves cached content if valid. '''
246 if url not in self._cache: return __.absent
247 entry = self._cache[ url ]
248 if entry.invalid:
249 self._remove( url )
250 return __.absent
251 self._record_access( url )
252 return ( entry.response.extract( ), entry.headers )
254 def determine_ttl( self, response: ContentResponse ) -> float:
255 ''' Determines appropriate TTL based on response type. '''
256 if response.is_value( ):
257 return self.success_ttl
258 # TODO: Inspect exception type for more granular TTL
259 return self.error_ttl
261 async def retrieve_url(
262 self,
263 url: _Url, /, *,
264 duration_max: float = 30.0,
265 client_factory: HttpClientFactory = _httpx.AsyncClient,
266 ) -> bytes:
267 ''' Convenience method for retrieving URL content. '''
268 return await retrieve_url(
269 self, url,
270 duration_max = duration_max,
271 client_factory = client_factory )
273 async def store(
274 self, url: str, response: ContentResponse,
275 headers: _httpx.Headers, ttl: float
276 ) -> None:
277 ''' Stores content in cache with memory management. '''
278 size_bytes = self._calculate_response_size( response )
279 entry = ContentCacheEntry(
280 response = response,
281 headers = headers,
282 timestamp = __.time.time( ),
283 ttl = ttl,
284 size_bytes = size_bytes )
285 if old_entry := self._cache.get( url ):
286 self._memory_total -= old_entry.memory_usage
287 self._cache[ url ] = entry
288 self._memory_total += entry.memory_usage
289 self._record_access( url )
290 self._evict_by_memory( )
292 def _calculate_response_size( self, response: ContentResponse ) -> int:
293 ''' Calculates memory footprint of cached response. '''
294 if response.is_value( ):
295 content = response.extract( )
296 return len( content )
297 return 100 # Conservative estimate for exception overhead
299 def _evict_by_memory( self ) -> None:
300 ''' Evicts LRU entries until memory usage is under limit. '''
301 while (
302 self._memory_total > self.memory_max
303 and self._recency
304 ):
305 lru_url = self._recency.popleft( )
306 if lru_url in self._cache: # pragma: no branch
307 entry = self._cache[ lru_url ]
308 self._memory_total -= entry.memory_usage
309 del self._cache[ lru_url ]
310 _scribe.debug( f"Evicted cache entry: {lru_url}" )
312 def _record_access( self, url: str ) -> None:
313 ''' Updates LRU access order for given URL. '''
314 with __.ctxl.suppress( ValueError ):
315 self._recency.remove( url )
316 self._recency.append( url )
318 def _remove( self, url: str ) -> None:
319 ''' Removes entry from cache and updates memory tracking. '''
320 if entry := self._cache.pop( url, None ):
321 self._memory_total -= entry.memory_usage
322 with __.ctxl.suppress( ValueError ):
323 self._recency.remove( url )
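# Hedged sketch of configuration-driven construction; the mapping below is
# hypothetical but uses the same keys that `from_configuration` reads
# ('content-ttl' and 'memory-limit' under a 'cache' table).
def _example_content_cache_from_mapping( ) -> ContentCache:
    configuration = {
        'cache': { 'content-ttl': 600.0, 'memory-limit': 16 * 1024 * 1024 } }
    return ContentCache.from_configuration( configuration )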
326class ProbeCache( Cache ):
327 ''' Cache manager for URL probe results (HEAD requests). '''
329 entries_max: int = 1000
331 def __init__(
332 self, *,
333 robots_cache: __.Absential[ RobotsCache ] = __.absent,
334 entries_max: __.Absential[ int ] = __.absent,
335 **base_initargs: __.typx.Any
336 ) -> None:
337 super( ).__init__( **base_initargs )
338 if __.is_absent( robots_cache ):
339 self.robots_cache = RobotsCache( **base_initargs )
340 else: self.robots_cache = robots_cache
341 if not __.is_absent( entries_max ): self.entries_max = entries_max
342 self._cache: dict[ str, ProbeCacheEntry ] = { }
343 self._recency: __.collections.deque[ str ] = __.collections.deque( )
345 @classmethod
346 def from_configuration(
347 cls,
348 configuration: __.cabc.Mapping[ str, __.typx.Any ],
349 robots_cache: __.Absential[ RobotsCache ] = __.absent
350 ) -> __.typx.Self:
351 ''' Creates ProbeCache instance from application configuration. '''
352 cache_config = configuration.get( 'cache', { } )
353 probe_ttl = cache_config.get( 'probe-ttl', 300.0 )
354 nomargs = { 'success_ttl': probe_ttl }
355 if not __.is_absent( robots_cache ):
356 nomargs[ 'robots_cache' ] = robots_cache
357 return cls( **nomargs )
359 async def access( self, url: str ) -> __.Absential[ bool ]:
360 ''' Retrieves cached probe result if valid. '''
361 if url not in self._cache: return __.absent
362 entry = self._cache[ url ]
363 if entry.invalid:
364 self._remove( url )
365 return __.absent
366 self._record_access( url )
367 return entry.response.extract( )
369 def determine_ttl( self, response: ProbeResponse ) -> float:
370 ''' Determines appropriate TTL based on response type. '''
371 if response.is_value( ):
372 return self.success_ttl
373 # TODO: Inspect exception type for more granular TTL
374 return self.error_ttl
376 async def probe_url(
377 self,
378 url: _Url, /, *,
379 duration_max: float = 10.0,
380 client_factory: HttpClientFactory = _httpx.AsyncClient,
381 ) -> bool:
382 ''' Convenience method for probing URL existence. '''
383 return await probe_url(
384 self, url,
385 duration_max = duration_max,
386 client_factory = client_factory )
388 async def store(
389 self, url: str, response: ProbeResponse, ttl: float
390 ) -> None:
391 ''' Stores probe result in cache. '''
392 entry = ProbeCacheEntry(
393 response = response,
394 timestamp = __.time.time( ),
395 ttl = ttl )
396 self._cache[ url ] = entry
397 self._record_access( url )
398 self._evict_by_count( )
400 def _evict_by_count( self ) -> None:
401 ''' Evicts oldest entries when cache exceeds max size. '''
402 while (
403 len( self._cache ) > self.entries_max
404 and self._recency
405 ):
406 lru_url = self._recency.popleft( )
407 if lru_url in self._cache: # pragma: no branch
408 del self._cache[ lru_url ]
410 def _record_access( self, url: str ) -> None:
411 ''' Updates LRU access order for given URL. '''
412 with __.ctxl.suppress( ValueError ):
413 self._recency.remove( url )
414 self._recency.append( url )
416 def _remove( self, url: str ) -> None:
417 ''' Removes entry from cache. '''
418 self._cache.pop( url, None )
419 with __.ctxl.suppress( ValueError ):
420 self._recency.remove( url )
423_http_success_threshold = 400
426_scribe = __.acquire_scribe( __name__ )
429def prepare(
430 auxdata: __.Globals
431) -> tuple[ ContentCache, ProbeCache, RobotsCache ]:
432 ''' Prepares cache instances from configuration.
434 Returns cache instances constructed from application configuration.
435 '''
436 configuration = auxdata.configuration
437 robots_cache = RobotsCache.from_configuration( configuration )
438 return (
439 ContentCache.from_configuration( configuration, robots_cache ),
440 ProbeCache.from_configuration( configuration, robots_cache ),
441 robots_cache,
442 )
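# Hedged sketch of consuming the tuple returned by `prepare`; `auxdata` is
# assumed to be an application-configured `__.Globals` instance.
#
#     content_cache, probe_cache, robots_cache = prepare( auxdata )
#     # Both caches share the single robots cache for crawl-delay state.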
445async def probe_url(
446 cache: ProbeCache,
447 url: _Url, *,
448 duration_max: float = 10.0,
449 client_factory: HttpClientFactory = _httpx.AsyncClient,
450) -> bool:
451 ''' Cached HEAD request to check URL existence. '''
452 url_s = url.geturl( )
453 match url.scheme:
454 case '' | 'file':
455 return __.Path( url.path ).exists( )
456 case 'http' | 'https':
457 result = await cache.access( url_s )
458 if not __.is_absent( result ): return result
459 async with client_factory( ) as client:
460 result = await _probe_url(
461 url, duration_max = duration_max,
462 client = client,
463 probe_cache = cache,
464 robots_cache = cache.robots_cache )
465 ttl = cache.determine_ttl( result )
466 await cache.store( url_s, result, ttl )
467 return result.extract( )
468 case _: return False
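# Hedged usage sketch for `probe_url`; the URL is hypothetical and the
# local import only keeps the example self-contained.
async def _example_probe( probe_cache: ProbeCache ) -> bool:
    from urllib.parse import urlparse
    url = urlparse( 'https://example.org/objects.inv' )
    return await probe_url( probe_cache, url, duration_max = 5.0 )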
471async def retrieve_url(
472 cache: ContentCache,
473 url: _Url, *,
474 duration_max: float = 30.0,
475 client_factory: HttpClientFactory = _httpx.AsyncClient,
476) -> bytes:
477 ''' Cached GET request to fetch URL content as bytes. '''
478 url_s = url.geturl( )
479 match url.scheme:
480 case '' | 'file':
481 location = __.Path( url.path )
482 try: return location.read_bytes( )
483 except Exception as exc:
484 raise _exceptions.DocumentationInaccessibility(
485 url_s, exc ) from exc
486 case 'http' | 'https':
487 result = await cache.access( url_s )
488 if not __.is_absent( result ):
489 content_bytes, _ = result
490 return content_bytes
491 async with client_factory( ) as client:
492 result, headers = await _retrieve_url(
493 url,
494 duration_max = duration_max,
495 client = client,
496 content_cache = cache,
497 robots_cache = cache.robots_cache )
498 ttl = cache.determine_ttl( result )
499 await cache.store( url_s, result, headers, ttl )
500 return result.extract( )
501 case _:
502 raise _exceptions.DocumentationInaccessibility(
503 url_s, f"Unsupported scheme: {url.scheme}" )
506async def retrieve_url_as_text(
507 cache: ContentCache,
508 url: _Url, *,
509 duration_max: float = 30.0,
510 charset_default: str = 'utf-8',
511 client_factory: HttpClientFactory = _httpx.AsyncClient,
512) -> str:
513 ''' Cached GET request to fetch URL content as text. '''
514 url_s = url.geturl( )
515 match url.scheme:
516 case '' | 'file':
517 location = __.Path( url.path )
518 try: content_bytes = location.read_bytes( )
519 except Exception as exc:
520 raise _exceptions.DocumentationInaccessibility(
521 url_s, exc ) from exc
522 mimetype, charset = __.detext.detect_mimetype_and_charset(
523 content_bytes, location )
524 if not __.detext.is_textual_content( content_bytes ):
525 raise _exceptions.DocumentationInaccessibility(
526 url_s, "Content analysis indicates non-textual data" )
527 encoding = charset or charset_default
528 return content_bytes.decode( encoding )
529 case 'http' | 'https':
530 result = await cache.access( url_s )
531 if not __.is_absent( result ):
532 content_bytes, headers = result
533 _validate_textual_content(
534 content_bytes, headers, url_s )
535 charset = _detect_charset_with_fallback(
536 content_bytes, headers, charset_default )
537 return content_bytes.decode( charset )
538 async with client_factory( ) as client:
539 result, headers = await _retrieve_url(
540 url, duration_max = duration_max,
541 client = client,
542 content_cache = cache,
543 robots_cache = cache.robots_cache )
544 ttl = cache.determine_ttl( result )
545 await cache.store( url_s, result, headers, ttl )
546 content_bytes = result.extract( )
547 _validate_textual_content(
548 content_bytes, headers, url_s )
549 charset = _detect_charset_with_fallback(
550 content_bytes, headers, charset_default )
551 return content_bytes.decode( charset )
552 case _:
553 raise _exceptions.DocumentationInaccessibility(
554 url_s, f"Unsupported scheme: {url.scheme}" )
557async def _apply_request_delay(
558 url: _Url,
559 client: _httpx.AsyncClient,
560 cache: RobotsCache,
561) -> None:
562 ''' Applies crawl delay to request if specified in robots.txt. '''
563 if url.scheme not in ( 'http', 'https' ): return
564 domain = _extract_domain( url )
565 delay = cache.calculate_delay_remainder( domain )
566 if delay > 0: await cache.delay_function( delay )
567 parser = await cache.access( domain )
568 if __.is_absent( parser ):
569 parser = await _retrieve_robots_txt( client, cache, domain )
570 if not __.is_absent( parser ):
571 try: delay = parser.crawl_delay( cache.user_agent )
572 except Exception as exc:
573 _scribe.debug( f"Failed to get crawl delay for {domain}: {exc}" )
574 else:
575 if delay: cache.assign_delay( domain, float( delay ) )
578async def _cache_robots_txt_error(
579 domain: str, cache: RobotsCache, error: Exception
580) -> __.Absential[ _RobotFileParser ]:
581 _scribe.debug( f"Failed to fetch/parse robots.txt from {domain}: {error}" )
582 result: RobotsResponse = _generics.Error( error )
583 return await _cache_robots_txt_result( cache, domain, result )
586async def _cache_robots_txt_result(
587 cache: RobotsCache, domain: str, result: RobotsResponse
588) -> __.Absential[ _RobotFileParser ]:
589 ttl = cache.determine_ttl( result )
590 await cache.store( domain, result, ttl )
591 return result.extract( ) if result.is_value( ) else __.absent
594async def _check_robots_txt(
595 url: _Url, *,
596 client: _httpx.AsyncClient,
597 cache: RobotsCache,
598) -> bool:
599 ''' Checks if URL is allowed by robots.txt. '''
600 if url.scheme not in ( 'http', 'https' ): return True
601 url_s = url.geturl( )
602 domain = _extract_domain( url )
603 parser = await cache.access( domain )
604 if __.is_absent( parser ):
605 parser = await _retrieve_robots_txt( client, cache, domain )
606 if __.is_absent( parser ): return True
607 try: return parser.can_fetch( cache.user_agent, url_s )
608 except Exception as exc:
609 _scribe.debug( f"robots.txt check failed for {url_s}: {exc}" )
610 return True # if robots.txt check fails, assume URL is allowed
613def _detect_charset_with_fallback(
614 content: bytes, headers: _httpx.Headers, default: str
615) -> str:
616 ''' Detects charset from headers with content-based fallback. '''
617 header_charset = _extract_charset_from_headers( headers, '' )
618 if header_charset:
619 return header_charset
620 detected_charset = __.detext.detect_charset( content )
621 return detected_charset or default
624def _detect_mimetype_with_fallback(
625 content: bytes, headers: _httpx.Headers, url: str
626) -> str:
627 ''' Detects MIME type from headers with content-based fallback. '''
628 header_mimetype = _extract_mimetype_from_headers( headers )
629 if header_mimetype:
630 return header_mimetype
631 return __.detext.detect_mimetype( content, url ) or ''
634def _extract_charset_from_headers(
635 headers: _httpx.Headers, default: str
636) -> str:
637 ''' Extracts charset from Content-Type header. '''
638 content_type = headers.get( 'content-type', '' )
639 if isinstance( content_type, str ) and ';' in content_type:
640 _, _, params = content_type.partition( ';' )
641 if 'charset=' in params:
642 charset = params.split( 'charset=' )[ -1 ].strip( )
643 return charset.strip( '"\\\'\"' )
644 return default
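# Hedged sketch of the Content-Type parsing above; the header value is
# hypothetical.
#
#     headers = _httpx.Headers( { 'content-type': 'text/html; charset=ISO-8859-1' } )
#     _extract_charset_from_headers( headers, 'utf-8' )  # -> 'ISO-8859-1'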
647def _extract_domain( url: _Url ) -> str:
648 ''' Extracts domain from URL for robots.txt caching. '''
649 return f"{url.scheme}://{url.netloc}"
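# Hedged sketch of domain extraction; the URL is hypothetical.
#
#     from urllib.parse import urlparse
#     _extract_domain( urlparse( 'https://example.org/en/latest/index.html' ) )
#     # -> 'https://example.org'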
652def _extract_mimetype_from_headers( headers: _httpx.Headers ) -> str:
653 ''' Extracts mimetype from Content-Type header. '''
654 content_type = headers.get( 'content-type', '' )
655 if isinstance( content_type, str ) and ';' in content_type:
656 mimetype, _, _ = content_type.partition( ';' )
657 return mimetype.strip( )
658 return content_type
661async def _probe_url(
662 url: _Url, /, *,
663 duration_max: float,
664 client: _httpx.AsyncClient,
665 probe_cache: ProbeCache,
666 robots_cache: RobotsCache,
667) -> ProbeResponse:
668 ''' Makes HEAD request with deduplication. '''
669 url_s = url.geturl( )
670 if not await _check_robots_txt(
671 url, client = client, cache = robots_cache
672 ):
673 _scribe.debug( f"URL blocked by robots.txt: {url_s}" )
674 return _generics.Error( _exceptions.UrlImpermissibility(
675 url_s, robots_cache.user_agent ) )
676 await _apply_request_delay( url, cache = robots_cache, client = client )
677 async with probe_cache.acquire_mutex_for( url_s ):
678 try:
679 response = await client.head(
680 url_s, timeout = duration_max, follow_redirects = True )
681 except Exception as exc:
682 _scribe.debug( f"HEAD request failed for {url_s}: {exc}" )
683 return _generics.Error( exc )
684 else:
685 return _generics.Value(
686 response.status_code < _http_success_threshold )
689async def _retrieve_robots_txt(
690 client: _httpx.AsyncClient, cache: RobotsCache, domain: str
691) -> __.Absential[ _RobotFileParser ]:
692 ''' Fetches and parses robots.txt for domain. '''
693 robots_url = f"{domain}/robots.txt"
694 async with cache.acquire_mutex_for( domain ):
695 timeout = cache.request_timeout
696 try:
697 response = await client.get(
698 robots_url, timeout = timeout, follow_redirects = True )
699 except Exception as exc:
700 return await _cache_robots_txt_error( domain, cache, exc )
701 match response.status_code:
702 case _HttpStatus.OK: lines = response.text.splitlines( )
703 case _HttpStatus.NOT_FOUND: lines = [ ]
704 case _:
705 try: response.raise_for_status( )
706 except Exception as exc:
707 return await _cache_robots_txt_error( domain, cache, exc )
708 robots_parser = _RobotFileParser( )
709 robots_parser.set_url( robots_url )
710 try: robots_parser.parse( lines )
711 except Exception as exc:
712 return await _cache_robots_txt_error( domain, cache, exc )
713 result: RobotsResponse = _generics.Value( robots_parser )
714 return await _cache_robots_txt_result( cache, domain, result )
717async def _retrieve_url(
718 url: _Url, /, *,
719 duration_max: float,
720 client: _httpx.AsyncClient,
721 content_cache: ContentCache,
722 robots_cache: RobotsCache,
723) -> tuple[ ContentResponse, _httpx.Headers ]:
724 ''' Makes GET request with deduplication. '''
725 url_s = url.geturl( )
726 if not await _check_robots_txt(
727 url, cache = robots_cache, client = client
728 ):
729 return (
730 _generics.Error( _exceptions.UrlImpermissibility(
731 url_s, robots_cache.user_agent ) ),
732 _httpx.Headers( ) )
733 await _apply_request_delay( url, cache = robots_cache, client = client )
734 async with content_cache.acquire_mutex_for( url_s ):
735 try:
736 response = await client.get(
737 url_s, timeout = duration_max, follow_redirects = True )
738 response.raise_for_status( )
739 except Exception as exc:
740 _scribe.debug( f"GET request failed for {url_s}: {exc}" )
741 return _generics.Error( exc ), _httpx.Headers( )
742 else: return _generics.Value( response.content ), response.headers
745def _validate_textual_content(
746 content: bytes, headers: _httpx.Headers, url: str
747) -> None:
748 ''' Validates that content is textual via headers and content analysis. '''
749 mimetype = _detect_mimetype_with_fallback( content, headers, url )
750 if mimetype and not __.detext.is_textual_mimetype( mimetype ):
751 raise _exceptions.HttpContentTypeInvalidity(
752 url, mimetype, "text decoding" )
753 if not __.detext.is_textual_content( content ):
754 raise _exceptions.HttpContentTypeInvalidity(
755 url, mimetype or 'unknown', "content analysis" )