Coverage for sources/librovore/inventories/mkdocs/detection.py: 0%

100 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-03 21:59 +0000

1# vim: set filetype=python fileencoding=utf-8: 

2# -*- coding: utf-8 -*- 

3 

4#============================================================================# 

5# # 

6# Licensed under the Apache License, Version 2.0 (the "License"); # 

7# you may not use this file except in compliance with the License. # 

8# You may obtain a copy of the License at # 

9# # 

10# http://www.apache.org/licenses/LICENSE-2.0 # 

11# # 

12# Unless required by applicable law or agreed to in writing, software # 

13# distributed under the License is distributed on an "AS IS" BASIS, # 

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 

15# See the License for the specific language governing permissions and # 

16# limitations under the License. # 

17# # 

18#============================================================================# 

19 

20 

21''' MkDocs inventory detection implementations. ''' 

22 

23 

24import json as _json 

25 

26from . import __ 

27 

28 

# Candidate locations of the search index, each appended to the base path of
# the site. They are probed in the order listed; the first usable one wins.
_SEARCH_INDEX_PATHS = (
    '/search/search_index.json',
    '/search_index.json',
    '/assets/search/search_index.json',
)

# Fewest entries (total, and valid) an index needs to be accepted.
_MINIMUM_DOCUMENT_COUNT = 1
# Valid-entry counts at which the base confidence steps up.
_MODERATE_DOCS_THRESHOLD = 5
_SUBSTANTIAL_DOCS_THRESHOLD = 10
# Longest run of document text, in characters, kept as a content preview.
_CONTENT_PREVIEW_LENGTH = 200

39 

40 

class MkDocsInventoryDetection( __.InventoryDetection ):
    ''' Detection outcome for a source probed for a MkDocs search index.

        Keeps the parsed search index, when one was found, so that later
        inventory queries can reuse it instead of fetching it again.
    '''

    # Parsed search index; defaults to absent when none has been captured.
    inventory_data: __.Absential[ dict[ str, __.typx.Any ] ] = __.absent

    @classmethod
    async def from_source(
        selfclass,
        auxdata: __.ApplicationGlobals,
        processor: __.Processor,
        source: str,
    ) -> __.typx.Self:
        ''' Probes source for a search index and wraps the outcome.

            The index data and confidence produced by the probe are stored
            on the new instance together with the given processor.
        '''
        probe_url = __.normalize_base_url( source )
        index, score = await probe_search_index( auxdata, probe_url )
        return selfclass(
            processor = processor,
            confidence = score,
            inventory_data = index )

    async def filter_inventory(
        self,
        auxdata: __.ApplicationGlobals,
        source: str, /, *,
        filters: __.cabc.Mapping[ str, __.typx.Any ],
        details: __.InventoryQueryDetails = (
            __.InventoryQueryDetails.Documentation ),
    ) -> tuple[ __.InventoryObject, ... ]:
        ''' Returns inventory objects from search index matching filters.

            Uses the index held by this detection. If none is held, the
            source is probed again; an empty tuple is returned when that
            probe finds nothing either.
        '''
        index = self.inventory_data
        if __.is_absent( index ):
            probe_url = __.normalize_base_url( source )
            index, _ = await probe_search_index( auxdata, probe_url )
            if __.is_absent( index ): return tuple( )
        # Within a method body the bare name resolves to the module-level
        # function of the same name, not to this method.
        return tuple( filter_inventory(
            index, source, filters = filters, details = details ) )

79 

80 

def calculate_confidence(
    docs: list[ __.typx.Any ], valid_docs: int
) -> float:
    ''' Scores a search index by its count and share of valid entries.

        A base score of 0.6 rises to 0.7 once the moderate threshold of
        valid entries is met and to 0.8 at the substantial threshold. The
        base score is then scaled by the fraction of entries which are
        valid and capped at 0.9. The score is 0.0 when there are no valid
        entries or when ``docs`` is empty.
    '''
    if valid_docs == 0: return 0.0
    share = valid_docs / len( docs ) if docs else 0.0
    if valid_docs >= _SUBSTANTIAL_DOCS_THRESHOLD: tier = 0.8
    elif valid_docs >= _MODERATE_DOCS_THRESHOLD: tier = 0.7
    else: tier = 0.6
    return min( tier * share, 0.9 )

91 

92 

def filter_inventory(
    inventory_data: dict[ str, __.typx.Any ],
    location_url: str, /, *,
    filters: __.cabc.Mapping[ str, __.typx.Any ],
    details: __.InventoryQueryDetails = (
        __.InventoryQueryDetails.Documentation ),
) -> list[ __.InventoryObject ]:
    ''' Selects search index documents which satisfy substring filters.

        The ``location`` and ``title`` entries of ``filters``, when present
        and non-empty, must each occur as a substring of the corresponding
        document field. Entries which are not dictionaries, or whose
        location or title is empty, are skipped. The ``details`` argument
        is accepted but not consulted by this function.
    '''
    entries = inventory_data.get( 'docs', [ ] )
    wanted_location = filters.get( 'location', '' ) or __.absent
    wanted_title = filters.get( 'title', '' ) or __.absent
    matches: list[ __.InventoryObject ] = [ ]
    for entry in entries:
        if not isinstance( entry, dict ): continue
        record = __.typx.cast( dict[ str, __.typx.Any ], entry )
        location = str( record.get( 'location', '' ) )
        title = str( record.get( 'title', '' ) )
        # NOTE(review): an empty location is rejected here, whereas
        # `_count_valid_docs` only checks that the key exists - confirm
        # that the mismatch is intended.
        if not ( location and title ): continue
        if not (
            __.is_absent( wanted_location ) or wanted_location in location
        ): continue
        if not (
            __.is_absent( wanted_title ) or wanted_title in title
        ): continue
        matches.append( format_inventory_object( record, location_url ) )
    return matches

122 

123 

class MkDocsInventoryObject( __.InventoryObject ):
    ''' Inventory object for an entry from a MkDocs search index. '''

    def render_specifics_markdown(
        self, /, *,
        reveal_internals: bool = True,
    ) -> tuple[ str, ... ]:
        ''' Renders role and domain specifics as Markdown bullet lines.

            A line is produced only for a specific with a truthy value. The
            ``reveal_internals`` flag is accepted but not consulted here.
        '''
        specifics = self.specifics
        role = specifics.get( 'role' )
        domain = specifics.get( 'domain' )
        candidates = (
            f"- **Type:** {role}" if role else None,
            f"- **Domain:** {domain}" if domain else None,
        )
        return tuple( line for line in candidates if line is not None )

    def render_specifics_json(
        self
    ) -> __.immut.Dictionary[ str, __.typx.Any ]:
        ''' Renders selected specifics as an immutable dictionary.

            Every listed key is always present in the result; a specific
            which is missing from the object maps to ``None``.
        '''
        keys = ( 'role', 'domain', 'object_type', 'content_preview' )
        return __.immut.Dictionary(
            **{ key: self.specifics.get( key ) for key in keys } )

151 

152 

def format_inventory_object(
    doc: dict[ str, __.typx.Any ],
    location_url: str,
) -> MkDocsInventoryObject:
    ''' Builds an inventory object from one search index document.

        The document title becomes the object name and its location the
        object URI. The document text is kept as a content preview; text
        longer than the preview length is cut there and suffixed with an
        ellipsis.
    '''
    uri = str( doc.get( 'location', '' ) )
    name = str( doc.get( 'title', '' ) )
    body = str( doc.get( 'text', '' ) )
    if len( body ) > _CONTENT_PREVIEW_LENGTH:
        preview = body[ :_CONTENT_PREVIEW_LENGTH ] + '...'
    else: preview = body
    specifics = __.immut.Dictionary(
        domain = 'page',
        role = 'doc',
        priority = '1',
        object_type = 'page',
        content_preview = preview )
    return MkDocsInventoryObject(
        name = name,
        uri = uri,
        inventory_type = 'mkdocs_search_index',
        location_url = location_url,
        specifics = specifics )

175 

176 

async def probe_search_index(
    auxdata: __.ApplicationGlobals,
    base_url: __.typx.Any,
) -> tuple[ __.Absential[ dict[ str, __.typx.Any ] ], float ]:
    ''' Looks for a usable MkDocs search index beneath the base URL.

        Each known index path is appended to the path of ``base_url`` and
        tried in turn. The first candidate which loads and validates
        supplies the returned index data and confidence. When no candidate
        succeeds, the result is absent data with a confidence of 0.0.
    '''
    # NOTE(review): relies on `base_url` offering `path` and `_replace`, as
    # the result tuples of `urllib.parse` do - confirm against the caller.
    for suffix in _SEARCH_INDEX_PATHS:
        candidate = base_url._replace( path = base_url.path + suffix )
        found = await _try_single_search_index( auxdata, candidate )
        if __.is_absent( found ): continue
        return found
    return __.absent, 0.0

187 

188 

def _count_valid_docs( docs: list[ __.typx.Any ] ) -> int:
    ''' Counts entries which are dictionaries with location and title.

        Only the presence of both keys is checked; their values are not
        inspected.
    '''
    return sum(
        1 for entry in docs
        if isinstance( entry, dict )
        and 'location' in entry and 'title' in entry )

197 

198 

def _is_valid_search_index( data: __.typx.Any ) -> bool:
    ''' Checks that data has the outline of a search index.

        The data must be a dictionary whose ``docs`` entry is a list with
        at least the minimum number of documents. A missing ``docs`` entry
        is treated as an empty list.
    '''
    if not isinstance( data, dict ): return False
    index = __.typx.cast( dict[ str, __.typx.Any ], data )
    entries = index.get( 'docs', [ ] )
    if isinstance( entries, list ):
        typed_entries = __.typx.cast( list[ __.typx.Any ], entries )
        return _MINIMUM_DOCUMENT_COUNT <= len( typed_entries )
    return False

209 

210 

async def _try_single_search_index(
    auxdata: __.ApplicationGlobals,
    search_url: __.typx.Any,
) -> __.Absential[ tuple[ dict[ str, __.typx.Any ], float ] ]:
    ''' Attempts to load and validate a single search index URL.

        Returns the parsed index paired with its confidence score. Returns
        absent when the URL yields no text, when the text is not parseable
        JSON, when the parsed data lacks the outline of a search index, or
        when too few of its documents are valid.
    '''
    search_index_raw = await __.retrieve_url_as_text(
        auxdata.content_cache, search_url )
    if __.is_absent( search_index_raw ):
        return __.absent
    # Catch only what a failed parse raises: ValueError covers both
    # JSONDecodeError and UnicodeDecodeError, TypeError covers a payload
    # which is not text, and RecursionError covers excessive nesting.
    # Other exceptions indicate a programming error and must surface rather
    # than be mistaken for "no index at this URL".
    try: inventory_data = _json.loads( search_index_raw )
    except ( ValueError, TypeError, RecursionError ):
        return __.absent
    if not _is_valid_search_index( inventory_data ):
        return __.absent
    # Read the key the same way the validator does, so that a missing
    # 'docs' entry can never raise KeyError, whatever the minimum count is.
    docs = inventory_data.get( 'docs', [ ] )
    valid_docs = _count_valid_docs( docs )
    if valid_docs < _MINIMUM_DOCUMENT_COUNT:
        return __.absent
    confidence = calculate_confidence( docs, valid_docs )
    return inventory_data, confidence