Coverage for sources/librovore/urlpatterns.py: 70%

97 statements  

coverage.py v7.10.6, created at 2025-09-06 02:25 +0000

# vim: set filetype=python fileencoding=utf-8:
# -*- coding: utf-8 -*-

#============================================================================#
#                                                                            #
# Licensed under the Apache License, Version 2.0 (the "License");            #
# you may not use this file except in compliance with the License.           #
# You may obtain a copy of the License at                                    #
#                                                                            #
#     http://www.apache.org/licenses/LICENSE-2.0                             #
#                                                                            #
# Unless required by applicable law or agreed to in writing, software        #
# distributed under the License is distributed on an "AS IS" BASIS,          #
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.   #
# See the License for the specific language governing permissions and       #
# limitations under the License.                                             #
#                                                                            #
#============================================================================#


''' URL pattern analysis and extension utilities for documentation. '''


from . import __
from . import cacheproxy as _cacheproxy
from . import state as _state

_Url = __.urlparse.ParseResult


_scribe = __.acquire_scribe( __name__ )


UrlPatternResult: __.typx.TypeAlias = __.immut.Dictionary[ str, __.typx.Any ]


class UrlPatternAnalysis( __.immut.DataclassObject ):
    ''' Analysis of documentation site URL patterns. '''

    base_url: _Url
    site_type: str
    candidate_patterns: tuple[ str, ... ]
    has_version_segment: bool

    @classmethod
    def from_url( cls, url: _Url ) -> __.typx.Self:
        ''' Analyzes URL to determine documentation site pattern. '''
        site_type = detect_documentation_site_type( url )
        candidate_patterns = produce_url_patterns( url, site_type )
        has_version_segment = detect_version_segment( url )
        return cls(
            base_url = url,
            site_type = site_type,
            candidate_patterns = candidate_patterns,
            has_version_segment = has_version_segment,
        )
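
# Illustrative usage (assuming '__.urlparse' re-exports 'urllib.parse'):
#   analysis = UrlPatternAnalysis.from_url(
#       __.urlparse.urlparse( 'https://project.readthedocs.io/en/latest' ) )
#   analysis.site_type == 'readthedocs'; analysis.has_version_segment is True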


def detect_documentation_site_type( url: _Url ) -> str:
    ''' Detects documentation hosting platform from URL. '''
    netloc = url.netloc.lower( )
    if 'readthedocs.io' in netloc or 'readthedocs.org' in netloc:
        return 'readthedocs'
    if 'github.io' in netloc:
        return 'github_pages'
    if 'gitlab.io' in netloc:
        return 'gitlab_pages'
    if 'netlify.app' in netloc or 'netlify.com' in netloc:
        return 'netlify'
    if 'vercel.app' in netloc:
        return 'vercel'
    return 'generic'
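
# E.g., 'https://user.github.io/project' maps to 'github_pages'; hosts that
# match none of the substring checks fall through to 'generic'.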


def detect_version_segment( url: _Url ) -> bool:
    ''' Detects if URL contains version-like path segments. '''
    path_segments = [
        segment for segment in url.path.split( '/' ) if segment
    ]
    return any( _is_version_like( segment ) for segment in path_segments )
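
# E.g., a path of '/en/v2.1/api' is version-like: 'en' appears in the known
# segment list and 'v2.1' matches the version regex below.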


def normalize_url_for_patterns( url: _Url ) -> _Url:
    ''' Normalizes URL by removing trailing slashes and fragments. '''
    path = url.path.rstrip( '/' ) if url.path != '/' else ''
    return url._replace( path = path, fragment = '', query = '' )
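
# Normalization drops the query and fragment along with trailing slashes,
# so 'https://example.com/docs/?q=1' becomes 'https://example.com/docs'.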


def produce_url_patterns( url: _Url, site_type: str ) -> tuple[ str, ... ]:
    ''' Produces candidate URL patterns for documentation discovery. '''
    normalized_url = normalize_url_for_patterns( url )
    patterns: list[ str ] = [ ]
    match site_type:
        case 'readthedocs':
            patterns.extend( _produce_readthedocs_patterns( normalized_url ) )
        case 'github_pages':
            patterns.extend( _produce_github_pages_patterns( normalized_url ) )
        case 'gitlab_pages':
            patterns.extend( _produce_gitlab_pages_patterns( normalized_url ) )
        case _:
            patterns.extend( _produce_generic_patterns( normalized_url ) )
    seen: set[ str ] = set( )
    deduplicated: list[ str ] = [ ]
    for pattern in patterns:
        if pattern not in seen:
            seen.add( pattern )
            deduplicated.append( pattern )
    return tuple( deduplicated )
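
# Deduplication preserves first-seen order, so the priority ordering chosen
# by each pattern producer survives intact.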


async def probe_url_patterns(
    auxdata: _state.Globals,
    base_url: _Url,
    inventory_path: str
) -> __.Absential[ _Url ]:
    ''' Probes URL patterns to find working inventory URL. '''
    analysis = UrlPatternAnalysis.from_url( base_url )
    tasks: list[ __.cabc.Awaitable[ bool ] ] = [
        _cacheproxy.probe_url(
            auxdata.probe_cache,
            __.urlparse.urlparse( __.urlparse.urlunparse( (
                candidate_url.scheme,
                candidate_url.netloc,
                candidate_url.path + inventory_path,
                candidate_url.params,
                candidate_url.query,
                candidate_url.fragment
            ) ) ) )
        for pattern in analysis.candidate_patterns
        # Single-element inner loop binds candidate_url once per pattern.
        for candidate_url in [ __.urlparse.urlparse( pattern ) ]
    ]
    results = await __.asyncf.gather_async(
        *tasks, return_exceptions = True )
    for i, result in enumerate( results ):
        if __.generics.is_value( result ) and result.value:
            pattern = analysis.candidate_patterns[ i ]
            return __.urlparse.urlparse( pattern )
    return __.absent
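
# Note: first-match selection assumes gather_async yields results in task
# submission order, as asyncio.gather does.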


def _is_version_like( segment: str ) -> bool:
    ''' Checks if path segment looks like a version identifier. '''
    version_patterns = [
        'latest', 'stable', 'main', 'master', 'dev', 'development',
        'v1', 'v2', 'v3', 'v4', 'v5',
        'en', 'docs',
    ]
    segment_lower = segment.lower( )
    if segment_lower in version_patterns:
        return True
    return bool( _matches_version_pattern( segment ) )


def _matches_version_pattern( segment: str ) -> bool:
    ''' Checks if segment matches common version patterns. '''
    version_regex = __.re.compile(
        r'^v?\d+(\.\d+)*([a-z]\d*)?$', __.re.IGNORECASE )
    return bool( version_regex.match( segment ) )
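
# The regex accepts forms such as 'v1', '2.0', and '1.2.3b2': an optional
# leading 'v', dotted numeric groups, and an optional suffix like 'a1'.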


def _produce_generic_patterns( url: _Url ) -> list[ str ]:
    ''' Produces generic documentation URL patterns. '''
    base_url = url.geturl( )
    patterns = [ base_url ]
    paths = [
        '/en/latest',
        '/latest',
        '/docs',
        '/documentation',
        '/en/stable',
        '/stable',
        '/main',
        '/master',
    ]
    for path in paths:
        pattern_url = url._replace( path = url.path.rstrip( '/' ) + path )
        patterns.append( pattern_url.geturl( ) )
    return patterns
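
# Generic candidates append common documentation layouts to the existing
# path; the base URL itself is always probed first.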


def _produce_github_pages_patterns( url: _Url ) -> list[ str ]:
    ''' Produces GitHub Pages specific URL patterns. '''
    patterns = _produce_generic_patterns( url )
    if url.path and url.path != '/':
        root_url = url._replace( path = '' )
        patterns.insert( 1, root_url.geturl( ) )
    return patterns
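
# For pages served under a subpath, the site root is probed as well; it is
# inserted just after the base URL, ahead of the generic suffix patterns.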


def _produce_gitlab_pages_patterns( url: _Url ) -> list[ str ]:
    ''' Produces GitLab Pages specific URL patterns. '''
    return _produce_github_pages_patterns( url )


def _produce_readthedocs_patterns( url: _Url ) -> list[ str ]:
    ''' Produces ReadTheDocs specific URL patterns. '''
    base_url = url.geturl( )
    patterns = [ base_url ]
    readthedocs_paths = [
        '/en/latest',
        '/en/stable',
        '/latest',
        '/stable',
        '/main',
        '/master',
    ]
    for path in readthedocs_paths:
        pattern_url = url._replace( path = path )
        patterns.append( pattern_url.geturl( ) )
    return patterns
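
# Unlike the generic producer, these candidates replace the path outright,
# matching the fixed '/en/latest'-style layouts ReadTheDocs serves.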