Coverage for sources/librovore/inventories/mkdocs/detection.py: 0%
100 statements
« prev ^ index » next coverage.py v7.10.5, created at 2025-08-29 01:14 +0000
« prev ^ index » next coverage.py v7.10.5, created at 2025-08-29 01:14 +0000
1# vim: set filetype=python fileencoding=utf-8:
2# -*- coding: utf-8 -*-
4#============================================================================#
5# #
6# Licensed under the Apache License, Version 2.0 (the "License"); #
7# you may not use this file except in compliance with the License. #
8# You may obtain a copy of the License at #
9# #
10# http://www.apache.org/licenses/LICENSE-2.0 #
11# #
12# Unless required by applicable law or agreed to in writing, software #
13# distributed under the License is distributed on an "AS IS" BASIS, #
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
15# See the License for the specific language governing permissions and #
16# limitations under the License. #
17# #
18#============================================================================#
21''' MkDocs inventory detection implementations. '''
24import json as _json
26from . import __
# Candidate index locations; each is appended to the base URL path and
# tried in this order until one yields a usable search index.
_SEARCH_INDEX_PATHS = (
    '/search/search_index.json',
    '/search_index.json',
    '/assets/search/search_index.json',
)

# Fewest entries (and fewest valid entries) an index must contain to be
# accepted at all.
_MINIMUM_DOCUMENT_COUNT = 1

# Valid-document counts at which the base confidence is raised.
_MODERATE_DOCS_THRESHOLD = 5
_SUBSTANTIAL_DOCS_THRESHOLD = 10

# Maximum number of text characters kept in a content preview before an
# ellipsis is appended.
_CONTENT_PREVIEW_LENGTH = 200
class MkDocsInventoryDetection( __.InventoryDetection ):
    ''' Detection outcome for sources exposing a MkDocs search index. '''

    # Parsed search index captured at detection time. Remains absent when
    # nothing was captured, in which case filtering probes the source again.
    inventory_data: __.Absential[ dict[ str, __.typx.Any ] ] = __.absent

    @classmethod
    async def from_source(
        selfclass,
        auxdata: __.ApplicationGlobals,
        processor: __.Processor,
        source: str,
    ) -> __.typx.Self:
        ''' Probes source for search index and builds detection from it.

            The index data and confidence reported by the probe are stored
            on the returned detection together with the given processor.
        '''
        index, score = await probe_search_index(
            auxdata, __.normalize_base_url( source ) )
        return selfclass(
            inventory_data = index,
            confidence = score,
            processor = processor )

    async def filter_inventory(
        self,
        auxdata: __.ApplicationGlobals,
        source: str, /, *,
        filters: __.cabc.Mapping[ str, __.typx.Any ],
        details: __.InventoryQueryDetails = (
            __.InventoryQueryDetails.Documentation ),
    ) -> tuple[ __.InventoryObject, ... ]:
        ''' Produces inventory objects which satisfy the given filters.

            Uses the index captured on this detection when present;
            otherwise probes the source again and returns an empty tuple
            if no index can be found.
        '''
        index = self.inventory_data
        if __.is_absent( index ):
            index, _ = await probe_search_index(
                auxdata, __.normalize_base_url( source ) )
            if __.is_absent( index ): return ( )
        # Inside a method body this name resolves to the module-level
        # function, not to this method.
        return tuple( filter_inventory(
            index, source, filters = filters, details = details ) )
def calculate_confidence(
    docs: list[ __.typx.Any ], valid_docs: int
) -> float:
    ''' Scores search index quality from its share of valid documents.

        A base confidence is chosen from the count of valid documents and
        then scaled by the fraction of all documents which are valid. The
        result is capped at 0.9; zero valid documents always scores 0.0.
    '''
    if valid_docs == 0: return 0.0
    valid_fraction = valid_docs / len( docs ) if docs else 0.0
    if valid_docs >= _SUBSTANTIAL_DOCS_THRESHOLD: baseline = 0.8
    elif valid_docs >= _MODERATE_DOCS_THRESHOLD: baseline = 0.7
    else: baseline = 0.6
    return min( baseline * valid_fraction, 0.9 )
def filter_inventory(
    inventory_data: dict[ str, __.typx.Any ],
    location_url: str, /, *,
    filters: __.cabc.Mapping[ str, __.typx.Any ],
    details: __.InventoryQueryDetails = (
        __.InventoryQueryDetails.Documentation ),
) -> list[ __.InventoryObject ]:
    ''' Selects search index documents which satisfy substring filters.

        Recognized filter keys are ``location`` and ``title``; a missing or
        empty value disables that filter. Entries which are not
        dictionaries, or which lack a non-empty location or title, are
        skipped. The ``details`` argument is accepted but not consulted.
    '''
    entries = inventory_data.get( 'docs', [ ] )
    location_needle = filters.get( 'location', '' ) or __.absent
    title_needle = filters.get( 'title', '' ) or __.absent
    check_location = not __.is_absent( location_needle )
    check_title = not __.is_absent( title_needle )
    selected: list[ __.InventoryObject ] = [ ]
    for entry in entries:
        if not isinstance( entry, dict ): continue
        record = __.typx.cast( dict[ str, __.typx.Any ], entry )
        location = str( record.get( 'location', '' ) )
        title = str( record.get( 'title', '' ) )
        if not ( location and title ): continue
        if check_location and location_needle not in location: continue
        if check_title and title_needle not in title: continue
        selected.append( format_inventory_object( record, location_url ) )
    return selected
class MkDocsInventoryObject( __.InventoryObject ):
    ''' Inventory object for a page from a MkDocs search index. '''

    def render_specifics_markdown(
        self, /, *,
        reveal_internals: bool = True,
    ) -> tuple[ str, ... ]:
        ''' Renders role and domain as Markdown bullet lines.

            A line is emitted only for a specific with a truthy value. The
            ``reveal_internals`` flag is accepted but not consulted.
        '''
        specifics = self.specifics
        rendered: list[ str ] = [ ]
        if role := specifics.get( 'role' ):
            rendered.append( f"- **Type:** {role}" )
        if domain := specifics.get( 'domain' ):
            rendered.append( f"- **Domain:** {domain}" )
        return tuple( rendered )

    def render_specifics_json(
        self
    ) -> __.immut.Dictionary[ str, __.typx.Any ]:
        ''' Renders selected specifics as an immutable dictionary.

            Specifics which are missing appear with a value of ``None``.
        '''
        exported = ( 'role', 'domain', 'object_type', 'content_preview' )
        return __.immut.Dictionary(
            **{ key: self.specifics.get( key ) for key in exported } )
def format_inventory_object(
    doc: dict[ str, __.typx.Any ],
    location_url: str,
) -> MkDocsInventoryObject:
    ''' Converts a search index document into an inventory object.

        The document title becomes the object name and its location the
        object URI. Document text longer than the preview length is
        truncated and suffixed with an ellipsis for the content preview.
    '''
    body = str( doc.get( 'text', '' ) )
    if len( body ) > _CONTENT_PREVIEW_LENGTH:
        preview = body[ :_CONTENT_PREVIEW_LENGTH ] + '...'
    else: preview = body
    specifics = __.immut.Dictionary(
        domain = 'page',
        role = 'doc',
        priority = '1',
        object_type = 'page',
        content_preview = preview )
    return MkDocsInventoryObject(
        name = str( doc.get( 'title', '' ) ),
        uri = str( doc.get( 'location', '' ) ),
        inventory_type = 'mkdocs_search_index',
        location_url = location_url,
        specifics = specifics )
async def probe_search_index(
    auxdata: __.ApplicationGlobals,
    base_url: __.typx.Any,
) -> tuple[ __.Absential[ dict[ str, __.typx.Any ] ], float ]:
    ''' Tries each known search index location beneath the base URL.

        Returns the index data and confidence from the first location
        which yields a usable index, or an absent value with zero
        confidence when none does.
    '''
    for suffix in _SEARCH_INDEX_PATHS:
        candidate = base_url._replace( path = base_url.path + suffix )
        outcome = await _try_single_search_index( auxdata, candidate )
        if __.is_absent( outcome ): continue
        return outcome
    return __.absent, 0.0
def _count_valid_docs( docs: list[ __.typx.Any ] ) -> int:
    ''' Counts dictionary entries which have location and title keys. '''
    return sum(
        1 for entry in docs
        if isinstance( entry, dict )
        and 'location' in entry
        and 'title' in entry )
def _is_valid_search_index( data: __.typx.Any ) -> bool:
    ''' Checks that data is a dictionary with a non-empty docs list.

        The ``docs`` entry must be a list holding at least the minimum
        number of documents; a missing entry counts as an empty list.
    '''
    if not isinstance( data, dict ): return False
    entries = __.typx.cast(
        dict[ str, __.typx.Any ], data ).get( 'docs', [ ] )
    return (
        isinstance( entries, list )
        and len( entries ) >= _MINIMUM_DOCUMENT_COUNT )
async def _try_single_search_index(
    auxdata: __.ApplicationGlobals,
    search_url: __.typx.Any,
) -> __.Absential[ tuple[ dict[ str, __.typx.Any ], float ] ]:
    ''' Attempts to load and validate a single search index URL.

        Returns the parsed index with its confidence score. Returns an
        absent value when nothing could be retrieved, when the content is
        not parseable JSON, when the structure is not a valid search
        index, or when it holds too few valid documents.
    '''
    search_index_raw = await __.retrieve_url_as_text(
        auxdata.content_cache, search_url )
    if __.is_absent( search_index_raw ): return __.absent
    # Catch only what the JSON parser itself can raise. ValueError is the
    # base of both JSONDecodeError and UnicodeDecodeError, TypeError
    # covers non-text input, and RecursionError covers pathologically
    # nested documents. Unrelated errors are no longer swallowed.
    try: inventory_data = _json.loads( search_index_raw )
    except ( ValueError, TypeError, RecursionError ): return __.absent
    if not _is_valid_search_index( inventory_data ): return __.absent
    # Validation above guarantees a non-empty list under the 'docs' key.
    docs = inventory_data[ 'docs' ]
    valid_docs = _count_valid_docs( docs )
    if valid_docs < _MINIMUM_DOCUMENT_COUNT: return __.absent
    return inventory_data, calculate_confidence( docs, valid_docs )