Coverage for sources/librovore/cacheproxy.py: 87%
411 statements
coverage.py v7.10.4, created at 2025-08-17 23:43 +0000
1# vim: set filetype=python fileencoding=utf-8:
2# -*- coding: utf-8 -*-
4#============================================================================#
5# #
6# Licensed under the Apache License, Version 2.0 (the "License"); #
7# you may not use this file except in compliance with the License. #
8# You may obtain a copy of the License at #
9# #
10# http://www.apache.org/licenses/LICENSE-2.0 #
11# #
12# Unless required by applicable law or agreed to in writing, software #
13# distributed under the License is distributed on an "AS IS" BASIS, #
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
15# See the License for the specific language governing permissions and #
16# limitations under the License. #
17# #
18#============================================================================#
21''' HTTP cache for documentation URL access. '''
24from http import HTTPStatus as _HttpStatus
25from urllib.parse import ParseResult as _Url
26from urllib.robotparser import RobotFileParser as _RobotFileParser
28import appcore.generics as _generics
29import httpx as _httpx
31from . import __
32from . import exceptions as _exceptions
35HttpClientFactory: __.typx.TypeAlias = (
36 __.cabc.Callable[ [ ], _httpx.AsyncClient ] )
37ContentResponse: __.typx.TypeAlias = _generics.Result[ bytes, Exception ]
38ProbeResponse: __.typx.TypeAlias = _generics.Result[ bool, Exception ]
39RobotsResponse: __.typx.TypeAlias = (
40 _generics.Result[ _RobotFileParser, Exception ] )
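# The three response aliases wrap appcore.generics.Result: a successful fetch
# carries its payload (bytes, bool, or a parsed RobotFileParser), while a
# failed fetch carries the exception.  Caching the Result rather than the raw
# value lets failures be retained briefly (error_ttl) instead of being retried
# on every call; extract( ) unwraps the payload at the call sites below.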
43class CacheEntry( __.immut.DataclassObject ):
44 ''' Cache entry base. '''
46 timestamp: float
47 ttl: float
49 @property
50 def invalid( self ) -> bool:
51 ''' Checks if cache entry has exceeded its TTL. '''
52 return __.time.time( ) - self.timestamp > self.ttl
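# Example: an entry stored with ttl = 300.0 reports invalid once more than
# 300 seconds have elapsed since its timestamp; access( ) then evicts it and
# the caller refetches.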
55class ContentCacheEntry( CacheEntry ):
56 ''' Cache entry for URL content with size tracking. '''
58 response: ContentResponse
59 headers: _httpx.Headers
60 size_bytes: int
62 @property
63 def memory_usage( self ) -> int:
64 ''' Calculates total memory usage including metadata. '''
65 return self.size_bytes + 100 # Overhead estimate
68class ProbeCacheEntry( CacheEntry ):
69 ''' Cache entry for URL probe results. '''
71 response: ProbeResponse
74class RobotsCacheEntry( CacheEntry ):
75 ''' Cache entry for robots.txt parser. '''
77 response: RobotsResponse
80class Cache( __.immut.Object ):
81 ''' Cache base with shared configuration attributes. '''
83 error_ttl: float = 30.0
84 success_ttl: float = 300.0
86 def __init__(
87 self, *,
88 error_ttl: __.Absential[ float ] = __.absent,
89 success_ttl: __.Absential[ float ] = __.absent,
90 delay_function: __.cabc.Callable[
91 [ float ], __.cabc.Awaitable[ None ]
92 ] = __.asyncio.sleep
93 ) -> None:
94 if not __.is_absent( error_ttl ): self.error_ttl = error_ttl
95 if not __.is_absent( success_ttl ): self.success_ttl = success_ttl
96 self.delay_function = delay_function
97 self._request_mutexes: dict[ str, __.asyncio.Lock ] = { }
99 @__.ctxl.asynccontextmanager
100 async def acquire_mutex_for( self, url: str ):
101 ''' Acquires mutex for HTTP request deduplication. '''
102 if url not in self._request_mutexes: # pragma: no branch
103 self._request_mutexes[ url ] = __.asyncio.Lock( )
104 mutex = self._request_mutexes[ url ]
105 async with mutex:
106 try: yield
107 finally: self._request_mutexes.pop( url, None )
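# Request deduplication sketch: concurrent fetches of one URL serialize on a
# shared per-URL lock, so only one HTTP request is in flight at a time.
#     async with cache.acquire_mutex_for( url_s ):
#         ...  # perform the HEAD/GET request
# The lock is dropped from the registry once its holder finishes.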
110class RobotsCache( Cache ):
111 ''' Cache manager for robots.txt files with crawl delay tracking. '''
113 entries_max: int = 500
114 request_timeout: float = 5.0
115 ttl: float = 3600.0
116 user_agent: str = '*'
118 def __init__(
119 self, *,
120 entries_max: __.Absential[ int ] = __.absent,
121 ttl: __.Absential[ float ] = __.absent,
122 request_timeout: __.Absential[ float ] = __.absent,
123 user_agent: __.Absential[ str ] = __.absent,
124 **base_initargs: __.typx.Any
125 ) -> None:
126 super( ).__init__( **base_initargs )
127 if not __.is_absent( entries_max ): self.entries_max = entries_max
128 if not __.is_absent( ttl ): self.ttl = ttl
129 if not __.is_absent( request_timeout ):
130 self.request_timeout = request_timeout
131 if not __.is_absent( user_agent ): self.user_agent = user_agent
132 self._cache: dict[ str, RobotsCacheEntry ] = { }
133 self._recency: __.collections.deque[ str ] = __.collections.deque( )
134 self._request_delays: dict[ str, float ] = { }
136 @classmethod
137 def from_configuration(
138 cls, configuration: __.cabc.Mapping[ str, __.typx.Any ]
139 ) -> __.typx.Self:
140 ''' Creates RobotsCache instance from application configuration. '''
141 cache_config = configuration.get( 'cache', { } )
142 robots_ttl = cache_config.get( 'robots-ttl', 3600.0 )
143 return cls( ttl = robots_ttl )
145 async def access( self, domain: str ) -> __.Absential[ _RobotFileParser ]:
146 ''' Retrieves cached robots.txt parser if valid. '''
147 if domain not in self._cache: return __.absent
148 entry = self._cache[ domain ]
149 if entry.invalid:
150 self._remove( domain )
151 return __.absent
152 self._record_access( domain )
153 return entry.response.extract( )
155 def assign_delay( self, domain: str, delay_seconds: float ) -> None:
156 ''' Sets next allowed request time for domain. '''
157 self._request_delays[ domain ] = __.time.time( ) + delay_seconds
159 def calculate_delay_remainder( self, domain: str ) -> float:
160 ''' Returns remaining crawl delay time for domain. '''
161 allow_at = self._request_delays.get( domain, 0.0 )
162 if not allow_at: return 0.0
163 remainder = allow_at - __.time.time( )
164 return max( 0.0, remainder )
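# Crawl-delay accounting: assign_delay( ) records an absolute "next allowed"
# timestamp per domain; calculate_delay_remainder( ) returns how long callers
# should still wait, clamped at zero once that moment has passed.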
166 def determine_ttl( self, response: RobotsResponse ) -> float:
167 ''' Determines appropriate TTL based on response type. '''
168 if response.is_value( ): return self.ttl
169 return self.error_ttl
171 async def store(
172 self, domain: str, response: RobotsResponse, ttl: float
173 ) -> None:
174 ''' Stores robots.txt parser in cache. '''
175 entry = RobotsCacheEntry(
176 response = response, timestamp = __.time.time( ), ttl = ttl )
177 self._cache[ domain ] = entry
178 self._record_access( domain )
179 self._evict_by_count( )
181 def _evict_by_count( self ) -> None:
182 ''' Evicts oldest entries when cache exceeds max size. '''
183 while (
184 len( self._cache ) > self.entries_max
185 and self._recency
186 ):
187 lru_domain = self._recency.popleft( )
188 if lru_domain in self._cache: # pragma: no branch
189 del self._cache[ lru_domain ]
191 def _record_access( self, domain: str ) -> None:
192 ''' Updates LRU access order for given domain. '''
193 with __.ctxl.suppress( ValueError ):
194 self._recency.remove( domain )
195 self._recency.append( domain )
197 def _remove( self, domain: str ) -> None:
198 ''' Removes entry from cache. '''
199 self._cache.pop( domain, None )
200 with __.ctxl.suppress( ValueError ):
201 self._recency.remove( domain )
204class ContentCache( Cache, instances_mutables = ( '_memory_total', ) ):
205 ''' Cache manager for URL content (GET requests) with memory tracking. '''
207 memory_max: int = 32 * 1024 * 1024
209 def __init__(
210 self, *,
211 robots_cache: __.Absential[ RobotsCache ] = __.absent,
212 memory_max: __.Absential[ int ] = __.absent,
213 **base_initargs: __.typx.Any
214 ) -> None:
215 super( ).__init__( **base_initargs )
216 if __.is_absent( robots_cache ):
217 self.robots_cache = RobotsCache( **base_initargs )
218 else: self.robots_cache = robots_cache
219 if not __.is_absent( memory_max ): self.memory_max = memory_max
220 self._cache: dict[ str, ContentCacheEntry ] = { }
221 self._memory_total = 0
222 self._recency: __.collections.deque[ str ] = __.collections.deque( )
224 @classmethod
225 def from_configuration(
226 cls,
227 configuration: __.cabc.Mapping[ str, __.typx.Any ],
228 robots_cache: __.Absential[ RobotsCache ] = __.absent
229 ) -> __.typx.Self:
230 ''' Creates ContentCache instance from application configuration. '''
231 cache_config = configuration.get( 'cache', { } )
232 content_ttl = cache_config.get( 'content-ttl', 300.0 )
233 memory_limit = cache_config.get( 'memory-limit', 33554432 )
234 nomargs = {
235 'success_ttl': content_ttl,
236 'memory_max': memory_limit,
237 }
238 if not __.is_absent( robots_cache ):
239 nomargs[ 'robots_cache' ] = robots_cache
240 return cls( **nomargs )
242 async def access(
243 self, url: str
244 ) -> __.Absential[ tuple[ bytes, _httpx.Headers ] ]:
245 ''' Retrieves cached content if valid. '''
246 if url not in self._cache: return __.absent
247 entry = self._cache[ url ]
248 if entry.invalid:
249 self._remove( url )
250 return __.absent
251 self._record_access( url )
252 return ( entry.response.extract( ), entry.headers )
254 def determine_ttl( self, response: ContentResponse ) -> float:
255 ''' Determines appropriate TTL based on response type. '''
256 if response.is_value( ):
257 return self.success_ttl
258 # TODO: Inspect exception type for more granular TTL
259 return self.error_ttl
261 async def retrieve_url(
262 self,
263 url: _Url, /, *,
264 duration_max: float = 30.0,
265 client_factory: HttpClientFactory = _httpx.AsyncClient,
266 ) -> bytes:
267 ''' Convenience method for retrieving URL content. '''
268 return await retrieve_url(
269 self, url,
270 duration_max = duration_max,
271 client_factory = client_factory )
273 async def store(
274 self, url: str, response: ContentResponse,
275 headers: _httpx.Headers, ttl: float
276 ) -> None:
277 ''' Stores content in cache with memory management. '''
278 size_bytes = self._calculate_response_size( response )
279 entry = ContentCacheEntry(
280 response = response,
281 headers = headers,
282 timestamp = __.time.time( ),
283 ttl = ttl,
284 size_bytes = size_bytes )
285 if old_entry := self._cache.get( url ):
286 self._memory_total -= old_entry.memory_usage
287 self._cache[ url ] = entry
288 self._memory_total += entry.memory_usage
289 self._record_access( url )
290 self._evict_by_memory( )
292 def _calculate_response_size( self, response: ContentResponse ) -> int:
293 ''' Calculates memory footprint of cached response. '''
294 if response.is_value( ):
295 content = response.extract( )
296 return len( content )
297 return 100 # Conservative estimate for exception overhead
299 def _evict_by_memory( self ) -> None:
300 ''' Evicts LRU entries until memory usage is under limit. '''
301 while (
302 self._memory_total > self.memory_max
303 and self._recency
304 ):
305 lru_url = self._recency.popleft( )
306 if lru_url in self._cache: # pragma: no branch
307 entry = self._cache[ lru_url ]
308 self._memory_total -= entry.memory_usage
309 del self._cache[ lru_url ]
310 _scribe.debug( f"Evicted cache entry: {lru_url}" )
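# Memory accounting: each entry contributes its payload size plus a fixed
# 100-byte overhead estimate; eviction pops least-recently-used URLs until
# the running total drops back under memory_max.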
312 def _record_access( self, url: str ) -> None:
313 ''' Updates LRU access order for given URL. '''
314 with __.ctxl.suppress( ValueError ):
315 self._recency.remove( url )
316 self._recency.append( url )
318 def _remove( self, url: str ) -> None:
319 ''' Removes entry from cache and updates memory tracking. '''
320 if entry := self._cache.pop( url, None ):
321 self._memory_total -= entry.memory_usage
322 with __.ctxl.suppress( ValueError ):
323 self._recency.remove( url )
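# Usage sketch (illustrative; the URL and limit are hypothetical), inside an
# async function:
#     from urllib.parse import urlparse
#     cache = ContentCache( memory_max = 8 * 1024 * 1024 )
#     payload = await cache.retrieve_url(
#         urlparse( 'https://example.org/index.html' ) )
# The first call performs a robots.txt-aware GET and caches the result; later
# calls within the TTL are served from memory.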
326class ProbeCache( Cache ):
327 ''' Cache manager for URL probe results (HEAD requests). '''
329 entries_max: int = 1000
331 def __init__(
332 self, *,
333 robots_cache: __.Absential[ RobotsCache ] = __.absent,
334 entries_max: __.Absential[ int ] = __.absent,
335 **base_initargs: __.typx.Any
336 ) -> None:
337 super( ).__init__( **base_initargs )
338 if __.is_absent( robots_cache ):
339 self.robots_cache = RobotsCache( **base_initargs )
340 else: self.robots_cache = robots_cache
341 if not __.is_absent( entries_max ): self.entries_max = entries_max
342 self._cache: dict[ str, ProbeCacheEntry ] = { }
343 self._recency: __.collections.deque[ str ] = __.collections.deque( )
345 @classmethod
346 def from_configuration(
347 cls,
348 configuration: __.cabc.Mapping[ str, __.typx.Any ],
349 robots_cache: __.Absential[ RobotsCache ] = __.absent
350 ) -> __.typx.Self:
351 ''' Creates ProbeCache instance from application configuration. '''
352 cache_config = configuration.get( 'cache', { } )
353 probe_ttl = cache_config.get( 'probe-ttl', 300.0 )
354 nomargs = { 'success_ttl': probe_ttl }
355 if not __.is_absent( robots_cache ):
356 nomargs[ 'robots_cache' ] = robots_cache
357 return cls( **nomargs )
359 async def access( self, url: str ) -> __.Absential[ bool ]:
360 ''' Retrieves cached probe result if valid. '''
361 if url not in self._cache: return __.absent
362 entry = self._cache[ url ]
363 if entry.invalid:
364 self._remove( url )
365 return __.absent
366 self._record_access( url )
367 return entry.response.extract( )
369 def determine_ttl( self, response: ProbeResponse ) -> float:
370 ''' Determines appropriate TTL based on response type. '''
371 if response.is_value( ):
372 return self.success_ttl
373 # TODO: Inspect exception type for more granular TTL
374 return self.error_ttl
376 async def probe_url(
377 self,
378 url: _Url, /, *,
379 duration_max: float = 10.0,
380 client_factory: HttpClientFactory = _httpx.AsyncClient,
381 ) -> bool:
382 ''' Convenience method for probing URL existence. '''
383 return await probe_url(
384 self, url,
385 duration_max = duration_max,
386 client_factory = client_factory )
388 async def store(
389 self, url: str, response: ProbeResponse, ttl: float
390 ) -> None:
391 ''' Stores probe result in cache. '''
392 entry = ProbeCacheEntry(
393 response = response,
394 timestamp = __.time.time( ),
395 ttl = ttl )
396 self._cache[ url ] = entry
397 self._record_access( url )
398 self._evict_by_count( )
400 def _evict_by_count( self ) -> None:
401 ''' Evicts oldest entries when cache exceeds max size. '''
402 while (
403 len( self._cache ) > self.entries_max
404 and self._recency
405 ):
406 lru_url = self._recency.popleft( )
407 if lru_url in self._cache: # pragma: no branch
408 del self._cache[ lru_url ]
410 def _record_access( self, url: str ) -> None:
411 ''' Updates LRU access order for given URL. '''
412 with __.ctxl.suppress( ValueError ):
413 self._recency.remove( url )
414 self._recency.append( url )
416 def _remove( self, url: str ) -> None:
417 ''' Removes entry from cache. '''
418 self._cache.pop( url, None )
419 with __.ctxl.suppress( ValueError ):
420 self._recency.remove( url )
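# Usage sketch (illustrative values), inside an async function:
#     from urllib.parse import urlparse
#     cache = ProbeCache( entries_max = 100 )
#     exists = await cache.probe_url(
#         urlparse( 'https://example.org/objects.inv' ) )
# A cached HEAD result answers repeat probes until its TTL lapses.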
423 _http_success_threshold = 400
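# Any HEAD status below 400 (informational, success, or redirect) is treated
# by _probe_url as evidence that the URL exists.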
426class CacheContext( __.immut.DataclassObject ):
427 ''' Context carrying configured cache instances. '''
429 content_cache: ContentCache
430 probe_cache: ProbeCache
431 robots_cache: RobotsCache
433 @classmethod
434 def from_configuration(
435 cls,
436 configuration: __.cabc.Mapping[ str, __.typx.Any ]
437 ) -> __.typx.Self:
438 ''' Creates cache context from application configuration. '''
439 robots_cache = RobotsCache.from_configuration( configuration )
440 return cls(
441 content_cache = ContentCache.from_configuration(
442 configuration, robots_cache ),
443 probe_cache = ProbeCache.from_configuration(
444 configuration, robots_cache ),
445 robots_cache = robots_cache,
446 )
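# Configuration sketch (keys taken from the from_configuration methods above;
# the values shown are the coded defaults and are illustrative):
#     configuration = { 'cache': {
#         'robots-ttl': 3600.0, 'content-ttl': 300.0,
#         'probe-ttl': 300.0, 'memory-limit': 33554432 } }
#     context = CacheContext.from_configuration( configuration )
# All three caches constructed this way share a single RobotsCache.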
449 _scribe = __.acquire_scribe( __name__ )
452def prepare(
453 auxdata: __.Globals
454) -> tuple[ ContentCache, ProbeCache, RobotsCache ]:
455 ''' Prepares cache instances from configuration.
457 Returns cache instances constructed from application configuration.
458 '''
459 configuration = auxdata.configuration
460 robots_cache = RobotsCache.from_configuration( configuration )
461 return (
462 ContentCache.from_configuration( configuration, robots_cache ),
463 ProbeCache.from_configuration( configuration, robots_cache ),
464 robots_cache,
465 )
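# prepare( ) is the functional counterpart of CacheContext.from_configuration:
# it reads auxdata.configuration and returns ( ContentCache, ProbeCache,
# RobotsCache ) built around one shared RobotsCache instance.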
468async def probe_url(
469 cache: ProbeCache,
470 url: _Url, *,
471 duration_max: float = 10.0,
472 client_factory: HttpClientFactory = _httpx.AsyncClient,
473) -> bool:
474 ''' Cached HEAD request to check URL existence. '''
475 url_s = url.geturl( )
476 match url.scheme:
477 case '' | 'file':
478 return __.Path( url.path ).exists( )
479 case 'http' | 'https':
480 result = await cache.access( url_s )
481 if not __.is_absent( result ): return result
482 async with client_factory( ) as client:
483 result = await _probe_url(
484 url, duration_max = duration_max,
485 client = client,
486 probe_cache = cache,
487 robots_cache = cache.robots_cache )
488 ttl = cache.determine_ttl( result )
489 await cache.store( url_s, result, ttl )
490 return result.extract( )
491 case _: return False
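# Scheme dispatch: file and empty schemes answer via Path.exists( ); HTTP(S)
# URLs go through the cache and a robots.txt-guarded HEAD request; any other
# scheme is reported as nonexistent.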
494async def retrieve_url(
495 cache: ContentCache,
496 url: _Url, *,
497 duration_max: float = 30.0,
498 client_factory: HttpClientFactory = _httpx.AsyncClient,
499) -> bytes:
500 ''' Cached GET request to fetch URL content as bytes. '''
501 url_s = url.geturl( )
502 match url.scheme:
503 case '' | 'file':
504 location = __.Path( url.path )
505 try: return location.read_bytes( )
506 except Exception as exc:
507 raise _exceptions.DocumentationInaccessibility(
508 url_s, exc ) from exc
509 case 'http' | 'https':
510 result = await cache.access( url_s )
511 if not __.is_absent( result ):
512 content_bytes, _ = result
513 return content_bytes
514 async with client_factory( ) as client:
515 result, headers = await _retrieve_url(
516 url,
517 duration_max = duration_max,
518 client = client,
519 content_cache = cache,
520 robots_cache = cache.robots_cache )
521 ttl = cache.determine_ttl( result )
522 await cache.store( url_s, result, headers, ttl )
523 return result.extract( )
524 case _:
525 raise _exceptions.DocumentationInaccessibility(
526 url_s, f"Unsupported scheme: {url.scheme}" )
529async def retrieve_url_as_text(
530 cache: ContentCache,
531 url: _Url, *,
532 duration_max: float = 30.0,
533 charset_default: str = 'utf-8',
534 client_factory: HttpClientFactory = _httpx.AsyncClient,
535) -> str:
536 ''' Cached GET request to fetch URL content as text. '''
537 url_s = url.geturl( )
538 match url.scheme:
539 case '' | 'file':
540 location = __.Path( url.path )
541 try: content_bytes = location.read_bytes( )
542 except Exception as exc:
543 raise _exceptions.DocumentationInaccessibility(
544 url_s, exc ) from exc
545 mimetype, charset = __.detext.detect_mimetype_and_charset(
546 content_bytes, location )
547 if not __.detext.is_textual_content( content_bytes ):  [547 ↛ 548: line 547 didn't jump to line 548 because the condition on line 547 was never true]
548 raise _exceptions.DocumentationInaccessibility(
549 url_s, "Content analysis indicates non-textual data" )
550 encoding = charset or charset_default
551 return content_bytes.decode( encoding )
552 case 'http' | 'https':
553 result = await cache.access( url_s )
554 if not __.is_absent( result ):
555 content_bytes, headers = result
556 _validate_textual_content(
557 content_bytes, headers, url_s )
558 charset = _detect_charset_with_fallback(
559 content_bytes, headers, charset_default )
560 return content_bytes.decode( charset )
561 async with client_factory( ) as client:
562 result, headers = await _retrieve_url(
563 url, duration_max = duration_max,
564 client = client,
565 content_cache = cache,
566 robots_cache = cache.robots_cache )
567 ttl = cache.determine_ttl( result )
568 await cache.store( url_s, result, headers, ttl )
569 content_bytes = result.extract( )
570 _validate_textual_content(
571 content_bytes, headers, url_s )
572 charset = _detect_charset_with_fallback(
573 content_bytes, headers, charset_default )
574 return content_bytes.decode( charset )
575 case _:
576 raise _exceptions.DocumentationInaccessibility(
577 url_s, f"Unsupported scheme: {url.scheme}" )
580async def _apply_request_delay(
581 url: _Url,
582 client: _httpx.AsyncClient,
583 cache: RobotsCache,
584) -> None:
585 ''' Applies crawl delay to request if specified in robots.txt. '''
586 if url.scheme not in ( 'http', 'https' ): return  [586 ↛ exit: line 586 didn't return from function '_apply_request_delay' because the return on line 586 wasn't executed]
587 domain = _extract_domain( url )
588 delay = cache.calculate_delay_remainder( domain )
589 if delay > 0: await cache.delay_function( delay )
590 parser = await cache.access( domain )
591 if __.is_absent( parser ):  [591 ↛ 592: line 591 didn't jump to line 592 because the condition on line 591 was never true]
592 parser = await _retrieve_robots_txt( client, cache, domain )
593 if not __.is_absent( parser ):  [593 ↛ exit: line 593 didn't return from function '_apply_request_delay' because the condition on line 593 was always true]
594 try: delay = parser.crawl_delay( cache.user_agent )
595 except Exception as exc:
596 _scribe.debug( f"Failed to get crawl delay for {domain}: {exc}" )
597 else:
598 if delay: cache.assign_delay( domain, float( delay ) )
601async def _cache_robots_txt_error(
602 domain: str, cache: RobotsCache, error: Exception
603) -> __.Absential[ _RobotFileParser ]:
604 _scribe.debug( f"Failed to fetch/parse robots.txt from {domain}: {error}" )
605 result: RobotsResponse = _generics.Error( error )
606 return await _cache_robots_txt_result( cache, domain, result )
609async def _cache_robots_txt_result(
610 cache: RobotsCache, domain: str, result: RobotsResponse
611) -> __.Absential[ _RobotFileParser ]:
612 ttl = cache.determine_ttl( result )
613 await cache.store( domain, result, ttl )
614 return result.extract( ) if result.is_value( ) else __.absent
617async def _check_robots_txt(
618 url: _Url, *,
619 client: _httpx.AsyncClient,
620 cache: RobotsCache,
621) -> bool:
622 ''' Checks if URL is allowed by robots.txt. '''
623 if url.scheme not in ( 'http', 'https' ): return True  [623 ↛ exit: line 623 didn't return from function '_check_robots_txt' because the return on line 623 wasn't executed]
624 url_s = url.geturl( )
625 domain = _extract_domain( url )
626 parser = await cache.access( domain )
627 if __.is_absent( parser ):  [627 ↛ 630: line 627 didn't jump to line 630 because the condition on line 627 was always true]
628 parser = await _retrieve_robots_txt( client, cache, domain )
629 if __.is_absent( parser ): return True
630 try: return parser.can_fetch( cache.user_agent, url_s )
631 except Exception as exc:
632 _scribe.debug( f"robots.txt check failed for {url_s}: {exc}" )
633 return True # assume URL allowed when the robots.txt check fails
636def _detect_charset_with_fallback(
637 content: bytes, headers: _httpx.Headers, default: str
638) -> str:
639 ''' Detects charset from headers with content-based fallback. '''
640 header_charset = _extract_charset_from_headers( headers, '' )
641 if header_charset:
642 return header_charset
643 detected_charset = __.detext.detect_charset( content )
644 return detected_charset or default
647def _detect_mimetype_with_fallback(
648 content: bytes, headers: _httpx.Headers, url: str
649) -> str:
650 ''' Detects MIME type from headers with content-based fallback. '''
651 header_mimetype = _extract_mimetype_from_headers( headers )
652 if header_mimetype:  [652 ↛ 654: line 652 didn't jump to line 654 because the condition on line 652 was always true]
653 return header_mimetype
654 return __.detext.detect_mimetype( content, url ) or ''
657def _extract_charset_from_headers(
658 headers: _httpx.Headers, default: str
659) -> str:
660 ''' Extracts charset from Content-Type header. '''
661 content_type = headers.get( 'content-type', '' )
662 if isinstance( content_type, str ) and ';' in content_type:
663 _, _, params = content_type.partition( ';' )
664 if 'charset=' in params:
665 charset = params.split( 'charset=' )[ -1 ].strip( )
666 return charset.strip( '"\\\'\"' )
667 return default
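# Example: a Content-Type of 'text/html; charset=utf-8' yields 'utf-8', while
# a bare 'text/html' (no parameters) falls through to the supplied default.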
670def _extract_domain( url: _Url ) -> str:
671 ''' Extracts domain from URL for robots.txt caching. '''
672 return f"{url.scheme}://{url.netloc}"
675def _extract_mimetype_from_headers( headers: _httpx.Headers ) -> str:
676 ''' Extracts mimetype from Content-Type header. '''
677 content_type = headers.get( 'content-type', '' )
678 if isinstance( content_type, str ) and ';' in content_type:
679 mimetype, _, _ = content_type.partition( ';' )
680 return mimetype.strip( )
681 return content_type
684def _raise_non_textual_content( url: str ) -> None:
685 ''' Raises exception for non-textual content. '''
686 raise _exceptions.DocumentationInaccessibility(
687 url, "Content analysis indicates non-textual data" )
690def _raise_non_textual_mimetype( url: str, mimetype: str ) -> None:
691 ''' Raises exception for non-textual MIME type. '''
692 raise _exceptions.DocumentationInaccessibility(
693 url, f"Non-textual content detected: {mimetype}" )
697async def _probe_url(
698 url: _Url, /, *,
699 duration_max: float,
700 client: _httpx.AsyncClient,
701 probe_cache: ProbeCache,
702 robots_cache: RobotsCache,
703) -> ProbeResponse:
704 ''' Makes HEAD request with deduplication. '''
705 url_s = url.geturl( )
706 if not await _check_robots_txt(  [706 ↛ 709: line 706 didn't jump to line 709 because the condition on line 706 was never true]
707 url, client = client, cache = robots_cache
708 ):
709 _scribe.debug( f"URL blocked by robots.txt: {url_s}" )
710 return _generics.Error( _exceptions.UrlImpermissibility(
711 url_s, robots_cache.user_agent ) )
712 await _apply_request_delay( url, cache = robots_cache, client = client )
713 async with probe_cache.acquire_mutex_for( url_s ):
714 try:
715 response = await client.head(
716 url_s, timeout = duration_max, follow_redirects = True )
717 except Exception as exc:
718 _scribe.debug( f"HEAD request failed for {url_s}: {exc}" )
719 return _generics.Error( exc )
720 else:
721 return _generics.Value(
722 response.status_code < _http_success_threshold )
725async def _retrieve_robots_txt(
726 client: _httpx.AsyncClient, cache: RobotsCache, domain: str
727) -> __.Absential[ _RobotFileParser ]:
728 ''' Fetches and parses robots.txt for domain. '''
729 robots_url = f"{domain}/robots.txt"
730 async with cache.acquire_mutex_for( domain ):
731 timeout = cache.request_timeout
732 try:
733 response = await client.get(
734 robots_url, timeout = timeout, follow_redirects = True )
735 except Exception as exc:
736 return await _cache_robots_txt_error( domain, cache, exc )
737 match response.status_code:
738 case _HttpStatus.OK: lines = response.text.splitlines( )  [738 ↛ 739: line 738 didn't jump to line 739 because the pattern on line 738 always matched]
739 case _HttpStatus.NOT_FOUND: lines = [ ]
740 case _:
741 try: response.raise_for_status( )
742 except Exception as exc:
743 return await _cache_robots_txt_error( domain, cache, exc )
744 robots_parser = _RobotFileParser( )
745 robots_parser.set_url( robots_url )
746 try: robots_parser.parse( lines )
747 except Exception as exc:
748 return await _cache_robots_txt_error( domain, cache, exc )
749 result: RobotsResponse = _generics.Value( robots_parser )
750 return await _cache_robots_txt_result( cache, domain, result )
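# A 200 response is parsed as-is; a 404 is parsed as an empty robots.txt,
# which permits everything.  Any other status, or a fetch/parse failure, is
# cached as an error Result so the domain is not retried before error_ttl
# expires.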
753async def _retrieve_url(
754 url: _Url, /, *,
755 duration_max: float,
756 client: _httpx.AsyncClient,
757 content_cache: ContentCache,
758 robots_cache: RobotsCache,
759) -> tuple[ ContentResponse, _httpx.Headers ]:
760 ''' Makes GET request with deduplication. '''
761 url_s = url.geturl( )
762 if not await _check_robots_txt(  [762 ↛ 765: line 762 didn't jump to line 765 because the condition on line 762 was never true]
763 url, cache = robots_cache, client = client
764 ):
765 return (
766 _generics.Error( _exceptions.UrlImpermissibility(
767 url_s, robots_cache.user_agent ) ),
768 _httpx.Headers( ) )
769 await _apply_request_delay( url, cache = robots_cache, client = client )
770 async with content_cache.acquire_mutex_for( url_s ):
771 try:
772 response = await client.get(
773 url_s, timeout = duration_max, follow_redirects = True )
774 response.raise_for_status( )
775 except Exception as exc:
776 _scribe.debug( f"GET request failed for {url_s}: {exc}" )
777 return _generics.Error( exc ), _httpx.Headers( )
778 else: return _generics.Value( response.content ), response.headers
781def _validate_textual_content(
782 content: bytes, headers: _httpx.Headers, url: str
783) -> None:
784 ''' Validates that content is textual via headers and content analysis. '''
785 mimetype = _detect_mimetype_with_fallback( content, headers, url )
786 if mimetype and not __.detext.is_textual_mimetype( mimetype ):
787 raise _exceptions.HttpContentTypeInvalidity(
788 url, mimetype, "text decoding" )
789 if not __.detext.is_textual_content( content ):  [789 ↛ 790: line 789 didn't jump to line 790 because the condition on line 789 was never true]
790 raise _exceptions.HttpContentTypeInvalidity(
791 url, mimetype or 'unknown', "content analysis" )