Coverage for sources/librovore/urlpatterns.py: 70%
97 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-10-20 18:40 +0000
1# vim: set filetype=python fileencoding=utf-8:
2# -*- coding: utf-8 -*-
4#============================================================================#
5# #
6# Licensed under the Apache License, Version 2.0 (the "License"); #
7# you may not use this file except in compliance with the License. #
8# You may obtain a copy of the License at #
9# #
10# http://www.apache.org/licenses/LICENSE-2.0 #
11# #
12# Unless required by applicable law or agreed to in writing, software #
13# distributed under the License is distributed on an "AS IS" BASIS, #
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
15# See the License for the specific language governing permissions and #
16# limitations under the License. #
17# #
18#============================================================================#
21''' URL pattern analysis and extension utilities for documentation. '''
24from . import __
25from . import cacheproxy as _cacheproxy
26from . import state as _state
28_Url = __.urlparse.ParseResult
31_scribe = __.acquire_scribe( __name__ )
34UrlPatternResult: __.typx.TypeAlias = __.immut.Dictionary[ str, __.typx.Any ]
class UrlPatternAnalysis( __.immut.DataclassObject ):
    ''' Analysis of documentation site URL patterns. '''

    base_url: _Url
    site_type: str
    candidate_patterns: tuple[ str, ... ]
    has_version_segment: bool

    @classmethod
    def from_url( cls, url: _Url ) -> __.typx.Self:
        ''' Analyzes URL to determine documentation site pattern. '''
        # Platform detection drives which candidate patterns are produced.
        kind = detect_documentation_site_type( url )
        return cls(
            base_url = url,
            site_type = kind,
            candidate_patterns = produce_url_patterns( url, kind ),
            has_version_segment = detect_version_segment( url ),
        )
def detect_documentation_site_type( url: _Url ) -> str:
    ''' Detects documentation hosting platform from URL. '''
    # Substring checks against the lowercased host; first match wins,
    # in the same priority order as before.
    host = url.netloc.lower( )
    platform_markers: tuple[ tuple[ tuple[ str, ... ], str ], ... ] = (
        ( ( 'readthedocs.io', 'readthedocs.org' ), 'readthedocs' ),
        ( ( 'github.io', ), 'github_pages' ),
        ( ( 'gitlab.io', ), 'gitlab_pages' ),
        ( ( 'netlify.app', 'netlify.com' ), 'netlify' ),
        ( ( 'vercel.app', ), 'vercel' ),
    )
    for needles, label in platform_markers:
        if any( needle in host for needle in needles ): return label
    return 'generic'
def detect_version_segment( url: _Url ) -> bool:
    ''' Detects if URL contains version-like path segments. '''
    # Empty strings from leading/trailing/double slashes are skipped.
    return any(
        _is_version_like( segment )
        for segment in url.path.split( '/' ) if segment )
def normalize_url_for_patterns( url: _Url ) -> _Url:
    ''' Normalizes URL by removing trailing slashes and fragments. '''
    # rstrip alone suffices: a bare '/' path strips to '' as well,
    # so no separate special case for the root path is needed.
    trimmed = url.path.rstrip( '/' )
    return url._replace( path = trimmed, fragment = '', query = '' )
def produce_url_patterns( url: _Url, site_type: str ) -> tuple[ str, ... ]:
    ''' Produces candidate URL patterns for documentation discovery. '''
    normalized = normalize_url_for_patterns( url )
    # Platform-specific producers; anything unrecognized gets the
    # generic pattern set.
    producers = {
        'readthedocs': _produce_readthedocs_patterns,
        'github_pages': _produce_github_pages_patterns,
        'gitlab_pages': _produce_gitlab_pages_patterns,
    }
    producer = producers.get( site_type, _produce_generic_patterns )
    candidates = producer( normalized )
    # dict.fromkeys deduplicates while preserving first-occurrence order.
    return tuple( dict.fromkeys( candidates ) )
async def probe_url_patterns(
    auxdata: _state.Globals,
    base_url: _Url,
    inventory_path: str
) -> __.Absential[ _Url ]:
    ''' Probes URL patterns to find working inventory URL.

        Produces candidate base URLs for ``base_url``, appends
        ``inventory_path`` to each, and probes them all concurrently via
        the probe cache. Returns the first candidate base URL (not the
        inventory URL) whose probe succeeded, or ``absent`` when none do.
    '''
    analysis = UrlPatternAnalysis.from_url( base_url )
    # One probe task per candidate pattern. The inner single-element
    # list binds the parsed pattern so it can be referenced repeatedly
    # inside the comprehension.
    # NOTE(review): plain string concatenation of path + inventory_path
    # assumes the candidate path has no trailing slash (normalization
    # strips it) and that inventory_path carries its own leading slash
    # — confirm against callers.
    tasks: list[ __.cabc.Awaitable[ bool ] ] = [
        _cacheproxy.probe_url(
            auxdata.probe_cache,
            __.urlparse.urlparse( __.urlparse.urlunparse( (
                candidate_url.scheme,
                candidate_url.netloc,
                candidate_url.path + inventory_path,
                candidate_url.params,
                candidate_url.query,
                candidate_url.fragment
            ) ) ) )
        for pattern in analysis.candidate_patterns
        for candidate_url in [ __.urlparse.urlparse( pattern ) ]
    ]
    # return_exceptions keeps one failed probe from cancelling the rest;
    # failures surface as non-value results and are skipped below.
    results = await __.asyncf.gather_async(
        *tasks, return_exceptions = True )
    # Results align with candidate_patterns by index; first success wins.
    for i, result in enumerate( results ):
        if __.generics.is_value( result ) and result.value:
            pattern = analysis.candidate_patterns[ i ]
            return __.urlparse.urlparse( pattern )
    return __.absent
# Known version-ish path segment names; module-level frozenset avoids
# rebuilding a list on every call and gives O(1) membership tests.
_VERSION_SEGMENT_NAMES: frozenset[ str ] = frozenset( (
    'latest', 'stable', 'main', 'master', 'dev', 'development',
    'v1', 'v2', 'v3', 'v4', 'v5',
    'en', 'docs',
) )


def _is_version_like( segment: str ) -> bool:
    ''' Checks if path segment looks like a version identifier.

        Matches either a known segment name (case-insensitively) or a
        numeric version pattern such as ``1.2`` or ``v2.0a1``.
    '''
    if segment.lower( ) in _VERSION_SEGMENT_NAMES: return True
    return bool( _matches_version_pattern( segment ) )
# Compiled once at module load instead of on every call; matches forms
# like '1', '2.10', 'v3.1', '1.2a1' (case-insensitive).
_VERSION_PATTERN_REGEX = __.re.compile(
    r'^v?\d+(\.\d+)*([a-z]\d*)?$', __.re.IGNORECASE )


def _matches_version_pattern( segment: str ) -> bool:
    ''' Checks if segment matches common version patterns. '''
    return bool( _VERSION_PATTERN_REGEX.match( segment ) )
def _produce_generic_patterns( url: _Url ) -> list[ str ]:
    ''' Produces generic documentation URL patterns. '''
    # Base URL first, then common documentation subpaths appended to
    # the (slash-trimmed) existing path.
    suffixes = (
        '/en/latest',
        '/latest',
        '/docs',
        '/documentation',
        '/en/stable',
        '/stable',
        '/main',
        '/master',
    )
    trimmed_path = url.path.rstrip( '/' )
    return [ url.geturl( ) ] + [
        url._replace( path = trimmed_path + suffix ).geturl( )
        for suffix in suffixes ]
def _produce_github_pages_patterns( url: _Url ) -> list[ str ]:
    ''' Produces GitHub Pages specific URL patterns. '''
    patterns = _produce_generic_patterns( url )
    # When the URL carries a project subpath, the account root is another
    # plausible documentation host: try it right after the base URL.
    if url.path not in ( '', '/' ):
        patterns.insert( 1, url._replace( path = '' ).geturl( ) )
    return patterns
def _produce_gitlab_pages_patterns( url: _Url ) -> list[ str ]:
    ''' Produces GitLab Pages specific URL patterns. '''
    # GitLab Pages uses the same account-root + subpath layout as
    # GitHub Pages, so its pattern set is reused verbatim.
    return _produce_github_pages_patterns( url )
def _produce_readthedocs_patterns( url: _Url ) -> list[ str ]:
    ''' Produces ReadTheDocs specific URL patterns. '''
    # Version paths replace the URL path outright (root-anchored) rather
    # than appending to it, unlike the generic pattern producer.
    version_paths = (
        '/en/latest',
        '/en/stable',
        '/latest',
        '/stable',
        '/main',
        '/master',
    )
    return [ url.geturl( ) ] + [
        url._replace( path = version_path ).geturl( )
        for version_path in version_paths ]