Coverage for sources/librovore/urlpatterns.py: 69%

99 statements  

« prev     ^ index     » next       coverage.py v7.10.4, created at 2025-08-20 22:48 +0000

1# vim: set filetype=python fileencoding=utf-8: 

2# -*- coding: utf-8 -*- 

3 

4#============================================================================# 

5# # 

6# Licensed under the Apache License, Version 2.0 (the "License"); # 

7# you may not use this file except in compliance with the License. # 

8# You may obtain a copy of the License at # 

9# # 

10# http://www.apache.org/licenses/LICENSE-2.0 # 

11# # 

12# Unless required by applicable law or agreed to in writing, software # 

13# distributed under the License is distributed on an "AS IS" BASIS, # 

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 

15# See the License for the specific language governing permissions and # 

16# limitations under the License. # 

17# # 

18#============================================================================# 

19 

20 

21''' URL pattern analysis and extension utilities for documentation. ''' 

22 

23 

24from . import __ 

25from . import cacheproxy as _cacheproxy 

26from . import state as _state 

27 

# Shorthand alias for the stdlib parsed-URL structure used throughout
# this module (all public functions accept and return this type).
_Url = __.urlparse.ParseResult


# Module-level logger, acquired through the package's common namespace.
_scribe = __.acquire_scribe( __name__ )


# Generic immutable mapping type for URL pattern operation results.
UrlPatternResult: __.typx.TypeAlias = __.immut.Dictionary[ str, __.typx.Any ]

35 

36 

class UrlPatternAnalysis( __.immut.DataclassObject ):
    ''' Analysis of documentation site URL patterns.

        Captures the hosting platform classification, the candidate URL
        patterns to probe, and whether the URL already carries a
        version-like path segment.
    '''

    base_url: _Url
    site_type: str
    candidate_patterns: tuple[ str, ... ]
    has_version_segment: bool

    @classmethod
    def from_url( cls, url: _Url ) -> __.typx.Self:
        ''' Analyzes URL to determine documentation site pattern. '''
        kind = detect_documentation_site_type( url )
        versioned = detect_version_segment( url )
        # Pattern production depends on the detected platform kind.
        patterns = produce_url_patterns( url, kind )
        return cls(
            base_url = url,
            site_type = kind,
            candidate_patterns = patterns,
            has_version_segment = versioned,
        )

57 

58 

def detect_documentation_site_type( url: _Url ) -> str:
    ''' Detects documentation hosting platform from URL. '''
    host = url.netloc.lower( )
    # Ordered marker table: first platform whose hostname fragment
    # appears in the netloc wins; otherwise fall through to generic.
    markers = (
        ( ( 'readthedocs.io', 'readthedocs.org' ), 'readthedocs' ),
        ( ( 'github.io', ), 'github_pages' ),
        ( ( 'gitlab.io', ), 'gitlab_pages' ),
        ( ( 'netlify.app', 'netlify.com' ), 'netlify' ),
        ( ( 'vercel.app', ), 'vercel' ),
    )
    for fragments, platform in markers:
        if any( fragment in host for fragment in fragments ):
            return platform
    return 'generic'

73 

74 

def detect_version_segment( url: _Url ) -> bool:
    ''' Detects if URL contains version-like path segments. '''
    # Empty segments from leading/trailing/double slashes are dropped.
    segments = filter( None, url.path.split( '/' ) )
    return any( map( _is_version_like, segments ) )

81 

82 

def normalize_url_for_patterns( url: _Url ) -> _Url:
    ''' Normalizes URL by removing trailing slashes and fragments. '''
    # A bare root path collapses to empty; otherwise strip the tail
    # slashes so pattern suffixes can be appended cleanly.
    if url.path == '/': trimmed = ''
    else: trimmed = url.path.rstrip( '/' )
    return url._replace( path = trimmed, query = '', fragment = '' )

87 

88 

89def produce_url_patterns( url: _Url, site_type: str ) -> tuple[ str, ... ]: 

90 ''' Produces candidate URL patterns for documentation discovery. ''' 

91 normalized_url = normalize_url_for_patterns( url ) 

92 patterns: list[ str ] = [ ] 

93 match site_type: 

94 case 'readthedocs': 94 ↛ 95line 94 didn't jump to line 95 because the pattern on line 94 never matched

95 patterns.extend( _produce_readthedocs_patterns( normalized_url ) ) 

96 case 'github_pages': 96 ↛ 97line 96 didn't jump to line 97 because the pattern on line 96 never matched

97 patterns.extend( _produce_github_pages_patterns( normalized_url ) ) 

98 case 'gitlab_pages': 98 ↛ 99line 98 didn't jump to line 99 because the pattern on line 98 never matched

99 patterns.extend( _produce_gitlab_pages_patterns( normalized_url ) ) 

100 case _: 

101 patterns.extend( _produce_generic_patterns( normalized_url ) ) 

102 seen: set[ str ] = set( ) 

103 deduplicated: list[ str ] = [ ] 

104 for pattern in patterns: 

105 if pattern not in seen: 105 ↛ 104line 105 didn't jump to line 104 because the condition on line 105 was always true

106 seen.add( pattern ) 

107 deduplicated.append( pattern ) 

108 return tuple( deduplicated ) 

109 

110 

async def probe_url_patterns(
    auxdata: _state.Globals,
    base_url: _Url,
    inventory_path: str = '/objects.inv'
) -> __.Absential[ _Url ]:
    ''' Probes URL patterns to find working inventory URL. '''
    analysis = UrlPatternAnalysis.from_url( base_url )
    for pattern in analysis.candidate_patterns:
        candidate = __.urlparse.urlparse( pattern )
        probe_target = candidate._replace(
            path = candidate.path + inventory_path )
        # Probe failures are best-effort: log and move to the next
        # candidate rather than aborting the whole search.
        try:
            found = await _cacheproxy.probe_url(
                auxdata.probe_cache, probe_target )
        except Exception as exc:
            _scribe.debug( "Pattern probe failed for %s: %s",
                           probe_target.geturl( ), exc )
            continue
        if found: return candidate
    return __.absent

132 

133 

# Exact (case-insensitive) segment names commonly used in docs URL
# paths for versions/languages. Hoisted to module level as a frozenset:
# the original rebuilt a list on every call and paid O(n) membership.
_VERSION_SEGMENT_NAMES = frozenset( (
    'latest', 'stable', 'main', 'master', 'dev', 'development',
    'v1', 'v2', 'v3', 'v4', 'v5',
    'en', 'docs',
) )


def _is_version_like( segment: str ) -> bool:
    ''' Checks if path segment looks like a version identifier. '''
    if segment.lower( ) in _VERSION_SEGMENT_NAMES: return True
    return bool( _matches_version_pattern( segment ) )

145 

146 

# Compiled once at import time (the original recompiled per call):
# optional 'v' prefix, dotted numeric components, and an optional
# trailing pre-release marker such as 'a1' or 'b2'.
_VERSION_REGEX = __.re.compile(
    r'^v?\d+(\.\d+)*([a-z]\d*)?$', __.re.IGNORECASE )


def _matches_version_pattern( segment: str ) -> bool:
    ''' Checks if segment matches common version patterns. '''
    return bool( _VERSION_REGEX.match( segment ) )

152 

153 

154def _produce_generic_patterns( url: _Url ) -> list[ str ]: 

155 ''' Produces generic documentation URL patterns. ''' 

156 base_url = url.geturl( ) 

157 patterns = [ base_url ] 

158 paths = [ 

159 '/en/latest', 

160 '/latest', 

161 '/docs', 

162 '/documentation', 

163 '/en/stable', 

164 '/stable', 

165 '/main', 

166 '/master', 

167 ] 

168 for path in paths: 

169 pattern_url = url._replace( path = url.path.rstrip( '/' ) + path ) 

170 patterns.append( pattern_url.geturl( ) ) 

171 return patterns 

172 

173 

def _produce_github_pages_patterns( url: _Url ) -> list[ str ]:
    ''' Produces GitHub Pages specific URL patterns. '''
    patterns = _produce_generic_patterns( url )
    # Project pages often live under a subpath; also try the site root,
    # slotted right after the base URL itself.
    if url.path not in ( '', '/' ):
        patterns.insert( 1, url._replace( path = '' ).geturl( ) )
    return patterns

181 

182 

def _produce_gitlab_pages_patterns( url: _Url ) -> list[ str ]:
    ''' Produces GitLab Pages specific URL patterns. '''
    # GitLab Pages layouts mirror GitHub Pages ones; reuse that logic.
    patterns = _produce_github_pages_patterns( url )
    return patterns

186 

187 

188def _produce_readthedocs_patterns( url: _Url ) -> list[ str ]: 

189 ''' Produces ReadTheDocs specific URL patterns. ''' 

190 base_url = url.geturl( ) 

191 patterns = [ base_url ] 

192 readthedocs_paths = [ 

193 '/en/latest', 

194 '/en/stable', 

195 '/latest', 

196 '/stable', 

197 '/main', 

198 '/master', 

199 ] 

200 for path in readthedocs_paths: 

201 pattern_url = url._replace( path = path ) 

202 patterns.append( pattern_url.geturl( ) ) 

203 return patterns