Coverage for sources/librovore/cacheproxy.py: 87%
404 statements
coverage.py v7.11.0, created at 2025-10-20 18:40 +0000
1# vim: set filetype=python fileencoding=utf-8:
2# -*- coding: utf-8 -*-
4#============================================================================#
5# #
6# Licensed under the Apache License, Version 2.0 (the "License"); #
7# you may not use this file except in compliance with the License. #
8# You may obtain a copy of the License at #
9# #
10# http://www.apache.org/licenses/LICENSE-2.0 #
11# #
12# Unless required by applicable law or agreed to in writing, software #
13# distributed under the License is distributed on an "AS IS" BASIS, #
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
15# See the License for the specific language governing permissions and #
16# limitations under the License. #
17# #
18#============================================================================#
21''' HTTP cache for documentation URL access. '''
24from http import HTTPStatus as _HttpStatus
25from urllib.parse import ParseResult as _Url
26from urllib.robotparser import RobotFileParser as _RobotFileParser
28import appcore.generics as _generics
29import httpx as _httpx
31from . import __
32from . import exceptions as _exceptions
35HttpClientFactory: __.typx.TypeAlias = (
36 __.cabc.Callable[ [ ], _httpx.AsyncClient ] )
37ContentResponse: __.typx.TypeAlias = _generics.Result[ bytes, Exception ]
38ProbeResponse: __.typx.TypeAlias = _generics.Result[ bool, Exception ]
39RobotsResponse: __.typx.TypeAlias = (
40 _generics.Result[ _RobotFileParser, Exception ] )
43class CacheEntry( __.immut.DataclassObject ):
44 ''' Cache entry base. '''
46 timestamp: float
47 ttl: float
49 @property
50 def invalid( self ) -> bool:
51 ''' Checks if cache entry has exceeded its TTL. '''
52 return __.time.time( ) - self.timestamp > self.ttl
55class ContentCacheEntry( CacheEntry ):
56 ''' Cache entry for URL content with size tracking. '''
58 response: ContentResponse
59 headers: _httpx.Headers
60 size_bytes: int
62 @property
63 def memory_usage( self ) -> int:
64 ''' Calculates total memory usage including metadata. '''
65 return self.size_bytes + 100 # Overhead estimate
68class ProbeCacheEntry( CacheEntry ):
69 ''' Cache entry for URL probe results. '''
71 response: ProbeResponse
74class RobotsCacheEntry( CacheEntry ):
75 ''' Cache entry for robots.txt parser. '''
77 response: RobotsResponse
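# Illustrative sketch (not part of the module): an entry becomes invalid once
# its age exceeds its TTL; the ten-second-old timestamp and keyword
# construction below mirror how the module itself builds entries.
def _example_entry_expiry( ) -> None:
    entry = ProbeCacheEntry(
        response = _generics.Value( True ),
        timestamp = __.time.time( ) - 10.0, ttl = 5.0 )
    assert entry.invalid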
80class Cache( __.immut.Object ):
81 ''' Cache base with shared configuration attributes. '''
83 error_ttl: float = 30.0
84 success_ttl: float = 300.0
86 def __init__(
87 self, *,
88 error_ttl: __.Absential[ float ] = __.absent,
89 success_ttl: __.Absential[ float ] = __.absent,
90 delay_function: __.cabc.Callable[
91 [ float ], __.cabc.Awaitable[ None ]
92 ] = __.asyncio.sleep
93 ) -> None:
94 if not __.is_absent( error_ttl ): self.error_ttl = error_ttl
95 if not __.is_absent( success_ttl ): self.success_ttl = success_ttl
96 self.delay_function = delay_function
97 self._request_mutexes: dict[ str, __.asyncio.Lock ] = { }
99 @__.ctxl.asynccontextmanager
100 async def acquire_mutex_for( self, url: str ):
101 ''' Acquires mutex for HTTP request deduplication. '''
102 if url not in self._request_mutexes: # pragma: no branch
103 self._request_mutexes[ url ] = __.asyncio.Lock( )
104 mutex = self._request_mutexes[ url ]
105 async with mutex:
106 try: yield
107 finally: self._request_mutexes.pop( url, None )
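# Illustrative sketch (not part of the module): concurrent callers for the same
# URL serialize on the per-URL mutex, so only one request body runs at a time;
# the helper name and the idea of two racing callers are hypothetical.
async def _example_request_deduplication( cache: Cache, url: str ) -> None:
    async def fetch_once( ) -> None:
        async with cache.acquire_mutex_for( url ):
            ...  # at most one coroutine executes here per URL at a time
    await __.asyncio.gather( fetch_once( ), fetch_once( ) )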
110class RobotsCache( Cache ):
111 ''' Cache manager for robots.txt files with crawl delay tracking. '''
113 entries_max: int = 500
114 request_timeout: float = 5.0
115 ttl: float = 3600.0
116 user_agent: str = '*'
117 def __init__(
118 self, *,
119 entries_max: __.Absential[ int ] = __.absent,
120 ttl: __.Absential[ float ] = __.absent,
121 request_timeout: __.Absential[ float ] = __.absent,
122 user_agent: __.Absential[ str ] = __.absent,
123 **base_initargs: __.typx.Any
124 ) -> None:
125 super( ).__init__( **base_initargs )
126 if not __.is_absent( entries_max ): self.entries_max = entries_max
127 if not __.is_absent( ttl ): self.ttl = ttl
128 if not __.is_absent( request_timeout ):
129 self.request_timeout = request_timeout
130 if not __.is_absent( user_agent ): self.user_agent = user_agent
131 self._cache: dict[ str, RobotsCacheEntry ] = { }
132 self._recency: __.collections.deque[ str ] = __.collections.deque( )
133 self._request_delays: dict[ str, float ] = { }
135 @classmethod
136 def from_configuration(
137 cls, configuration: __.cabc.Mapping[ str, __.typx.Any ]
138 ) -> __.typx.Self:
139 ''' Creates RobotsCache instance from application configuration. '''
140 cache_config = configuration.get( 'cache', { } )
141 robots_ttl = cache_config.get( 'robots-ttl', 3600.0 )
142 return cls( ttl = robots_ttl )
144 async def access(
145 self, client: _httpx.AsyncClient, domain: str, # TODO: retriever
146 ) -> _RobotFileParser:
147 ''' Retrieves cached robots.txt parser if valid. '''
148 if domain not in self._cache:
149 await _retrieve_robots_txt( client, self, domain )
150 entry = self._cache[ domain ]
151 if entry.invalid:  # coverage: 151 ↛ 152 (condition never true)
152 self._remove( domain )
153 await _retrieve_robots_txt( client, self, domain )
154 entry = self._cache[ domain ]
155 self._record_access( domain )
156 return entry.response.extract( )
158 def assign_delay( self, domain: str, delay_seconds: float ) -> None:
159 ''' Sets next allowed request time for domain. '''
160 self._request_delays[ domain ] = __.time.time( ) + delay_seconds
162 def calculate_delay_remainder( self, domain: str ) -> float:
163 ''' Returns remaining crawl delay time for domain. '''
164 allow_at = self._request_delays.get( domain, 0.0 )
165 if not allow_at: return 0.0
166 remainder = allow_at - __.time.time( )
167 return max( 0.0, remainder )
169 def determine_ttl( self, response: RobotsResponse ) -> float:
170 ''' Determines appropriate TTL based on response type. '''
171 if response.is_value( ): return self.ttl
172 return self.error_ttl
174 async def store(
175 self, domain: str, response: RobotsResponse, ttl: float
176 ) -> None:
177 ''' Stores robots.txt parser in cache. '''
178 entry = RobotsCacheEntry(
179 response = response, timestamp = __.time.time( ), ttl = ttl )
180 self._cache[ domain ] = entry
181 self._record_access( domain )
182 self._evict_by_count( )
184 def _evict_by_count( self ) -> None:
185 ''' Evicts oldest entries when cache exceeds max size. '''
186 while (
187 len( self._cache ) > self.entries_max
188 and self._recency
189 ):
190 lru_domain = self._recency.popleft( )
191 if lru_domain in self._cache: # pragma: no branch
192 del self._cache[ lru_domain ]
194 def _record_access( self, domain: str ) -> None:
195 ''' Updates LRU access order for given domain. '''
196 with __.ctxl.suppress( ValueError ):
197 self._recency.remove( domain )
198 self._recency.append( domain )
200 def _remove( self, domain: str ) -> None:
201 ''' Removes entry from cache. '''
202 self._cache.pop( domain, None )
203 with __.ctxl.suppress( ValueError ):
204 self._recency.remove( domain )
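# Illustrative sketch (not part of the module): crawl-delay bookkeeping is
# plain wall-clock arithmetic; the domain string is an example only.
def _example_crawl_delay_tracking( ) -> None:
    robots = RobotsCache( ttl = 3600.0 )
    robots.assign_delay( 'https://docs.example.org', 2.0 )
    remaining = robots.calculate_delay_remainder( 'https://docs.example.org' )
    assert 0.0 <= remaining <= 2.0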
207class ContentCache( Cache, instances_mutables = ( '_memory_total', ) ):
208 ''' Cache manager for URL content (GET requests) with memory tracking. '''
210 memory_max: int = 32 * 1024 * 1024
212 def __init__(
213 self, *,
214 robots_cache: __.Absential[ RobotsCache ] = __.absent,
215 memory_max: __.Absential[ int ] = __.absent,
216 **base_initargs: __.typx.Any
217 ) -> None:
218 super( ).__init__( **base_initargs )
219 if __.is_absent( robots_cache ):
220 self.robots_cache = RobotsCache( **base_initargs )
221 else: self.robots_cache = robots_cache
222 if not __.is_absent( memory_max ): self.memory_max = memory_max
223 self._cache: dict[ str, ContentCacheEntry ] = { }
224 self._memory_total = 0
225 self._recency: __.collections.deque[ str ] = __.collections.deque( )
227 @classmethod
228 def from_configuration(
229 cls,
230 configuration: __.cabc.Mapping[ str, __.typx.Any ],
231 robots_cache: __.Absential[ RobotsCache ] = __.absent
232 ) -> __.typx.Self:
233 ''' Creates ContentCache instance from application configuration. '''
234 cache_config = configuration.get( 'cache', { } )
235 content_ttl = cache_config.get( 'content-ttl', 300.0 )
236 memory_limit = cache_config.get( 'memory-limit', 33554432 )
237 nomargs = {
238 'success_ttl': content_ttl,
239 'memory_max': memory_limit,
240 }
241 if not __.is_absent( robots_cache ):
242 nomargs[ 'robots_cache' ] = robots_cache
243 return cls( **nomargs )
245 async def access(
246 self, url: str
247 ) -> __.Absential[ tuple[ bytes, _httpx.Headers ] ]:
248 ''' Retrieves cached content if valid. '''
249 if url not in self._cache: return __.absent
250 entry = self._cache[ url ]
251 if entry.invalid:
252 self._remove( url )
253 return __.absent
254 self._record_access( url )
255 return ( entry.response.extract( ), entry.headers )
257 def determine_ttl( self, response: ContentResponse ) -> float:
258 ''' Determines appropriate TTL based on response type. '''
259 if response.is_value( ):
260 return self.success_ttl
261 # TODO: Inspect exception type for more granular TTL
262 return self.error_ttl
264 async def retrieve_url(
265 self,
266 url: _Url, /, *,
267 duration_max: float = 30.0,
268 client_factory: HttpClientFactory = _httpx.AsyncClient,
269 ) -> bytes:
270 ''' Convenience method for retrieving URL content. '''
271 return await retrieve_url(
272 self, url,
273 duration_max = duration_max,
274 client_factory = client_factory )
276 async def store(
277 self, url: str, response: ContentResponse,
278 headers: _httpx.Headers, ttl: float
279 ) -> None:
280 ''' Stores content in cache with memory management. '''
281 size_bytes = self._calculate_response_size( response )
282 entry = ContentCacheEntry(
283 response = response,
284 headers = headers,
285 timestamp = __.time.time( ),
286 ttl = ttl,
287 size_bytes = size_bytes )
288 if old_entry := self._cache.get( url ):
289 self._memory_total -= old_entry.memory_usage
290 self._cache[ url ] = entry
291 self._memory_total += entry.memory_usage
292 self._record_access( url )
293 self._evict_by_memory( )
295 def _calculate_response_size( self, response: ContentResponse ) -> int:
296 ''' Calculates memory footprint of cached response. '''
297 if response.is_value( ):
298 content = response.extract( )
299 return len( content )
300 return 100 # Conservative estimate for exception overhead
302 def _evict_by_memory( self ) -> None:
303 ''' Evicts LRU entries until memory usage is under limit. '''
304 while (
305 self._memory_total > self.memory_max
306 and self._recency
307 ):
308 lru_url = self._recency.popleft( )
309 if lru_url in self._cache: # pragma: no branch
310 entry = self._cache[ lru_url ]
311 self._memory_total -= entry.memory_usage
312 del self._cache[ lru_url ]
313 _scribe.debug( f"Evicted cache entry: {lru_url}" )
315 def _record_access( self, url: str ) -> None:
316 ''' Updates LRU access order for given URL. '''
317 with __.ctxl.suppress( ValueError ):
318 self._recency.remove( url )
319 self._recency.append( url )
321 def _remove( self, url: str ) -> None:
322 ''' Removes entry from cache and updates memory tracking. '''
323 if entry := self._cache.pop( url, None ):
324 self._memory_total -= entry.memory_usage
325 with __.ctxl.suppress( ValueError ):
326 self._recency.remove( url )
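# Illustrative sketch (not part of the module): successful responses are cached
# with success_ttl, failures with the shorter error_ttl, using the Result types
# (_generics.Value / _generics.Error) already employed above.
def _example_ttl_selection( cache: ContentCache ) -> None:
    assert cache.determine_ttl( _generics.Value( b'payload' ) ) == cache.success_ttl
    assert cache.determine_ttl( _generics.Error( TimeoutError( ) ) ) == cache.error_ttl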
329class ProbeCache( Cache ):
330 ''' Cache manager for URL probe results (HEAD requests). '''
332 entries_max: int = 1000
334 def __init__(
335 self, *,
336 robots_cache: __.Absential[ RobotsCache ] = __.absent,
337 entries_max: __.Absential[ int ] = __.absent,
338 **base_initargs: __.typx.Any
339 ) -> None:
340 super( ).__init__( **base_initargs )
341 if __.is_absent( robots_cache ):
342 self.robots_cache = RobotsCache( **base_initargs )
343 else: self.robots_cache = robots_cache
344 if not __.is_absent( entries_max ): self.entries_max = entries_max
345 self._cache: dict[ str, ProbeCacheEntry ] = { }
346 self._recency: __.collections.deque[ str ] = __.collections.deque( )
348 @classmethod
349 def from_configuration(
350 cls,
351 configuration: __.cabc.Mapping[ str, __.typx.Any ],
352 robots_cache: __.Absential[ RobotsCache ] = __.absent
353 ) -> __.typx.Self:
354 ''' Creates ProbeCache instance from application configuration. '''
355 cache_config = configuration.get( 'cache', { } )
356 probe_ttl = cache_config.get( 'probe-ttl', 300.0 )
357 nomargs = { 'success_ttl': probe_ttl }
358 if not __.is_absent( robots_cache ):
359 nomargs[ 'robots_cache' ] = robots_cache
360 return cls( **nomargs )
362 async def access( self, url: str ) -> __.Absential[ bool ]:
363 ''' Retrieves cached probe result if valid. '''
364 if url not in self._cache: return __.absent
365 entry = self._cache[ url ]
366 if entry.invalid:
367 self._remove( url )
368 return __.absent
369 self._record_access( url )
370 return entry.response.extract( )
372 def determine_ttl( self, response: ProbeResponse ) -> float:
373 ''' Determines appropriate TTL based on response type. '''
374 if response.is_value( ):
375 return self.success_ttl
376 # TODO: Inspect exception type for more granular TTL
377 return self.error_ttl
379 async def probe_url(
380 self,
381 url: _Url, /, *,
382 duration_max: float = 10.0,
383 client_factory: HttpClientFactory = _httpx.AsyncClient,
384 ) -> bool:
385 ''' Convenience method for probing URL existence. '''
386 return await probe_url(
387 self, url,
388 duration_max = duration_max,
389 client_factory = client_factory )
391 async def store(
392 self, url: str, response: ProbeResponse, ttl: float
393 ) -> None:
394 ''' Stores probe result in cache. '''
395 entry = ProbeCacheEntry(
396 response = response,
397 timestamp = __.time.time( ),
398 ttl = ttl )
399 self._cache[ url ] = entry
400 self._record_access( url )
401 self._evict_by_count( )
403 def _evict_by_count( self ) -> None:
404 ''' Evicts oldest entries when cache exceeds max size. '''
405 while (
406 len( self._cache ) > self.entries_max
407 and self._recency
408 ):
409 lru_url = self._recency.popleft( )
410 if lru_url in self._cache: # pragma: no branch
411 del self._cache[ lru_url ]
413 def _record_access( self, url: str ) -> None:
414 ''' Updates LRU access order for given URL. '''
415 with __.ctxl.suppress( ValueError ):
416 self._recency.remove( url )
417 self._recency.append( url )
419 def _remove( self, url: str ) -> None:
420 ''' Removes entry from cache. '''
421 self._cache.pop( url, None )
422 with __.ctxl.suppress( ValueError ):
423 self._recency.remove( url )
426_http_success_threshold = 400
429_scribe = __.acquire_scribe( __name__ )
432def prepare(
433 auxdata: __.Globals
434) -> tuple[ ContentCache, ProbeCache, RobotsCache ]:
435 ''' Prepares cache instances from configuration.
437 Returns cache instances constructed from application configuration.
438 '''
439 configuration = auxdata.configuration
440 robots_cache = RobotsCache.from_configuration( configuration )
441 return (
442 ContentCache.from_configuration( configuration, robots_cache ),
443 ProbeCache.from_configuration( configuration, robots_cache ),
444 robots_cache,
445 )
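# Illustrative sketch (not part of the module): the same cache trio can be built
# from a plain mapping; the keys below are the ones prepare() and the
# from_configuration() constructors read, and the values are examples only.
def _example_caches_from_mapping( ) -> tuple[ ContentCache, ProbeCache, RobotsCache ]:
    configuration = {
        'cache': {
            'content-ttl': 300.0,      # ContentCache success TTL, seconds
            'probe-ttl': 300.0,        # ProbeCache success TTL, seconds
            'robots-ttl': 3600.0,      # RobotsCache TTL, seconds
            'memory-limit': 33554432,  # ContentCache memory budget, bytes
        },
    }
    robots = RobotsCache.from_configuration( configuration )
    return (
        ContentCache.from_configuration( configuration, robots ),
        ProbeCache.from_configuration( configuration, robots ),
        robots )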
448async def probe_url(
449 cache: ProbeCache,
450 url: _Url, *,
451 duration_max: float = 10.0,
452 client_factory: HttpClientFactory = _httpx.AsyncClient,
453) -> bool:
454 ''' Cached HEAD request to check URL existence. '''
455 url_s = url.geturl( )
456 match url.scheme:
457 case '' | 'file':
458 return __.Path( url.path ).exists( )
459 case 'http' | 'https':
460 result = await cache.access( url_s )
461 if not __.is_absent( result ): return result
462 async with client_factory( ) as client:
463 result = await _probe_url(
464 url, duration_max = duration_max,
465 client = client,
466 probe_cache = cache,
467 robots_cache = cache.robots_cache )
468 ttl = cache.determine_ttl( result )
469 await cache.store( url_s, result, ttl )
470 return result.extract( )
471 case _: return False
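# Illustrative sketch (not part of the module): probing a filesystem path and an
# HTTP URL through the same entry point; the paths and URLs are examples only.
async def _example_probe( cache: ProbeCache ) -> None:
    from urllib.parse import urlparse
    local_exists = await probe_url( cache, urlparse( '/srv/docs/objects.inv' ) )
    remote_exists = await probe_url(
        cache, urlparse( 'https://docs.example.org/objects.inv' ),
        duration_max = 5.0 )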
474async def retrieve_url(
475 cache: ContentCache,
476 url: _Url, *,
477 duration_max: float = 30.0,
478 client_factory: HttpClientFactory = _httpx.AsyncClient,
479) -> bytes:
480 ''' Cached GET request to fetch URL content as bytes. '''
481 url_s = url.geturl( )
482 match url.scheme:
483 case '' | 'file':
484 location = __.Path( url.path )
485 try: return location.read_bytes( )
486 except Exception as exc:
487 raise _exceptions.DocumentationInaccessibility(
488 url_s, exc ) from exc
489 case 'http' | 'https':
490 result = await cache.access( url_s )
491 if not __.is_absent( result ):
492 content_bytes, _ = result
493 return content_bytes
494 async with client_factory( ) as client:
495 result, headers = await _retrieve_url(
496 url,
497 duration_max = duration_max,
498 client = client,
499 content_cache = cache,
500 robots_cache = cache.robots_cache )
501 ttl = cache.determine_ttl( result )
502 await cache.store( url_s, result, headers, ttl )
503 return result.extract( )
504 case _:
505 raise _exceptions.DocumentationInaccessibility(
506 url_s, f"Unsupported scheme: {url.scheme}" )
509async def retrieve_url_as_text(
510 cache: ContentCache,
511 url: _Url, *,
512 duration_max: float = 30.0,
513 charset_default: str = 'utf-8',
514 client_factory: HttpClientFactory = _httpx.AsyncClient,
515) -> str:
516 ''' Cached GET request to fetch URL content as text. '''
517 url_s = url.geturl( )
518 match url.scheme:
519 case '' | 'file':
520 location = __.Path( url.path )
521 try: content_bytes = location.read_bytes( )
522 except Exception as exc:
523 raise _exceptions.DocumentationInaccessibility(
524 url_s, exc ) from exc
525 _, charset = __.detext.detect_mimetype_and_charset(
526 content_bytes, location )
527 if not __.detext.is_textual_content( content_bytes ):  # coverage: 527 ↛ 528 (condition never true)
528 raise _exceptions.DocumentationInaccessibility(
529 url_s, "Content analysis indicates non-textual data" )
530 encoding = charset or charset_default
531 return content_bytes.decode( encoding )
532 case 'http' | 'https':
533 result = await cache.access( url_s )
534 if not __.is_absent( result ):
535 content_bytes, headers = result
536 _validate_textual_content(
537 content_bytes, headers, url_s )
538 charset = _detect_charset_with_fallback(
539 content_bytes, headers, charset_default )
540 return content_bytes.decode( charset )
541 async with client_factory( ) as client:
542 result, headers = await _retrieve_url(
543 url, duration_max = duration_max,
544 client = client,
545 content_cache = cache,
546 robots_cache = cache.robots_cache )
547 ttl = cache.determine_ttl( result )
548 await cache.store( url_s, result, headers, ttl )
549 content_bytes = result.extract( )
550 _validate_textual_content(
551 content_bytes, headers, url_s )
552 charset = _detect_charset_with_fallback(
553 content_bytes, headers, charset_default )
554 return content_bytes.decode( charset )
555 case _:
556 raise _exceptions.DocumentationInaccessibility(
557 url_s, f"Unsupported scheme: {url.scheme}" )
560async def _apply_request_delay(
561 url: _Url,
562 client: _httpx.AsyncClient,
563 cache: RobotsCache,
564) -> None:
565 ''' Applies crawl delay to request if specified in robots.txt. '''
566 if url.scheme not in ( 'http', 'https' ): return  # coverage: 566 ↛ exit (return not executed)
567 domain = _extract_domain( url )
568 delay = cache.calculate_delay_remainder( domain )
569 if delay > 0: await cache.delay_function( delay )
570 try: parser = await cache.access( client, domain )
571 except _exceptions.RobotsTxtAccessFailure as exc:
572 _scribe.debug(
573 f"robots.txt access failed for {domain}: {exc.cause}. "
574 f"Skipping crawl delay application." )
575 return # Skip crawl delay when robots.txt unavailable
576 try: delay = parser.crawl_delay( cache.user_agent )
577 except Exception as exc:
578 _scribe.debug( f"Failed to get crawl delay for {domain}: {exc}" )
579 else:
580 if delay: cache.assign_delay( domain, float( delay ) )
583async def _cache_robots_txt_error(
584 domain: str, cache: RobotsCache, error: Exception
585) -> __.Absential[ _RobotFileParser ]:
586 _scribe.debug( f"Failed to fetch/parse robots.txt from {domain}: {error}" )
587 if isinstance( error, _exceptions.RobotsTxtAccessFailure ):  # coverage: 587 ↛ 588 (condition never true)
588 result: RobotsResponse = _generics.Error( error )
589 else:
590 access_failure = _exceptions.RobotsTxtAccessFailure( domain, error )
591 result = _generics.Error( access_failure )
592 return await _cache_robots_txt_result( cache, domain, result )
595async def _cache_robots_txt_result(
596 cache: RobotsCache, domain: str, result: RobotsResponse
597) -> __.Absential[ _RobotFileParser ]:
598 ttl = cache.determine_ttl( result )
599 await cache.store( domain, result, ttl )
600 return result.extract( ) if result.is_value( ) else __.absent
603async def _check_robots_txt(
604 url: _Url, *,
605 client: _httpx.AsyncClient,
606 cache: RobotsCache,
607) -> bool:
608 ''' Checks if URL is allowed by robots.txt. '''
609 if url.scheme not in ( 'http', 'https' ): return True  # coverage: 609 ↛ exit (return not executed)
610 url_s = url.geturl( )
611 domain = _extract_domain( url )
612 try: parser = await cache.access( client, domain )
613 except _exceptions.RobotsTxtAccessFailure as exc:
614 _scribe.warning(
615 f"robots.txt access failed for {domain}: {exc.cause}. "
616 f"Proceeding without robots.txt validation." )
617 return True # Allow access when robots.txt unavailable
618 try: return parser.can_fetch( cache.user_agent, url_s )
619 except Exception as exc:
620 _scribe.debug( f"robots.txt check failed for {url_s}: {exc}" )
621 return True # if no robots.txt, then assume URL allowed
624def _detect_charset_with_fallback(
625 content: bytes, headers: _httpx.Headers, default: str
626) -> str:
627 ''' Detects charset from headers with content-based fallback. '''
628 header_charset = _extract_charset_from_headers( headers, '' )
629 if header_charset:
630 return header_charset
631 detected_charset = __.detext.detect_charset( content )
632 return detected_charset or default
635def _detect_mimetype_with_fallback(
636 content: bytes, headers: _httpx.Headers, url: str
637) -> str:
638 ''' Detects MIME type from headers with content-based fallback. '''
639 header_mimetype = _extract_mimetype_from_headers( headers )
640 if header_mimetype:  # coverage: 640 ↛ 642 (condition always true)
641 return header_mimetype
642 return __.detext.detect_mimetype( content, url ) or ''
645def _extract_charset_from_headers(
646 headers: _httpx.Headers, default: str
647) -> str:
648 ''' Extracts charset from Content-Type header. '''
649 content_type = headers.get( 'content-type', '' )
650 if isinstance( content_type, str ) and ';' in content_type:
651 _, _, params = content_type.partition( ';' )
652 if 'charset=' in params:
653 charset = params.split( 'charset=' )[ -1 ].strip( )
654 return charset.strip( '"\\\'\"' )
655 return default
658def _extract_domain( url: _Url ) -> str:
659 ''' Extracts domain from URL for robots.txt caching. '''
660 return f"{url.scheme}://{url.netloc}"
663def _extract_mimetype_from_headers( headers: _httpx.Headers ) -> str:
664 ''' Extracts mimetype from Content-Type header. '''
665 content_type = headers.get( 'content-type', '' )
666 if isinstance( content_type, str ) and ';' in content_type:
667 mimetype, _, _ = content_type.partition( ';' )
668 return mimetype.strip( )
669 return content_type
672async def _probe_url(
673 url: _Url, /, *,
674 duration_max: float,
675 client: _httpx.AsyncClient,
676 probe_cache: ProbeCache,
677 robots_cache: RobotsCache,
678) -> ProbeResponse:
679 ''' Makes HEAD request with deduplication. '''
680 url_s = url.geturl( )
681 if not await _check_robots_txt(  # coverage: 681 ↛ 684 (condition never true)
682 url, client = client, cache = robots_cache
683 ):
684 _scribe.debug( f"URL blocked by robots.txt: {url_s}" )
685 return _generics.Error( _exceptions.UrlImpermissibility(
686 url_s, robots_cache.user_agent ) )
687 await _apply_request_delay( url, cache = robots_cache, client = client )
688 async with probe_cache.acquire_mutex_for( url_s ):
689 try:
690 response = await client.head(
691 url_s, timeout = duration_max, follow_redirects = True )
692 except Exception as exc:
693 _scribe.debug( f"HEAD request failed for {url_s}: {exc}" )
694 return _generics.Error( exc )
695 else:
696 return _generics.Value(
697 response.status_code < _http_success_threshold )
700async def _retrieve_robots_txt(
701 client: _httpx.AsyncClient, cache: RobotsCache, domain: str
702) -> __.Absential[ _RobotFileParser ]:
703 ''' Fetches and parses robots.txt for domain. '''
704 robots_url = f"{domain}/robots.txt"
705 async with cache.acquire_mutex_for( domain ):
706 timeout = cache.request_timeout
707 try:
708 response = await client.get(
709 robots_url, timeout = timeout, follow_redirects = True )
710 except Exception as exc:
711 return await _cache_robots_txt_error( domain, cache, exc )
712 match response.status_code:
713 case _HttpStatus.OK: lines = response.text.splitlines( )  # coverage: 713 ↛ 714 (pattern always matched)
714 case _HttpStatus.NOT_FOUND: lines = [ ]
715 case _:
716 try: response.raise_for_status( )
717 except Exception as exc:
718 return await _cache_robots_txt_error( domain, cache, exc )
719 robots_parser = _RobotFileParser( )
720 robots_parser.set_url( robots_url )
721 try: robots_parser.parse( lines )
722 except Exception as exc:
723 return await _cache_robots_txt_error( domain, cache, exc )
724 result: RobotsResponse = _generics.Value( robots_parser )
725 return await _cache_robots_txt_result( cache, domain, result )
728async def _retrieve_url(
729 url: _Url, /, *,
730 duration_max: float,
731 client: _httpx.AsyncClient,
732 content_cache: ContentCache,
733 robots_cache: RobotsCache,
734) -> tuple[ ContentResponse, _httpx.Headers ]:
735 ''' Makes GET request with deduplication. '''
736 url_s = url.geturl( )
737 if not await _check_robots_txt(  # coverage: 737 ↛ 740 (condition never true)
738 url, cache = robots_cache, client = client
739 ):
740 return (
741 _generics.Error( _exceptions.UrlImpermissibility(
742 url_s, robots_cache.user_agent ) ),
743 _httpx.Headers( ) )
744 await _apply_request_delay( url, cache = robots_cache, client = client )
745 async with content_cache.acquire_mutex_for( url_s ):
746 try:
747 response = await client.get(
748 url_s, timeout = duration_max, follow_redirects = True )
749 response.raise_for_status( )
750 except Exception as exc:
751 _scribe.debug( f"GET request failed for {url_s}: {exc}" )
752 return _generics.Error( exc ), _httpx.Headers( )
753 else: return _generics.Value( response.content ), response.headers
756def _validate_textual_content(
757 content: bytes, headers: _httpx.Headers, url: str
758) -> None:
759 ''' Validates that content is textual via headers and content analysis. '''
760 mimetype = _detect_mimetype_with_fallback( content, headers, url )
761 if mimetype and not __.detext.is_textual_mimetype( mimetype ):
762 raise _exceptions.HttpContentTypeInvalidity(
763 url, mimetype, "text decoding" )
764 if not __.detext.is_textual_content( content ):  # coverage: 764 ↛ 765 (condition never true)
765 raise _exceptions.HttpContentTypeInvalidity(
766 url, mimetype or 'unknown', "content analysis" )