Coverage for sources/detextive/detectors.py: 100%
122 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-20 18:02 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-20 18:02 +0000
1# vim: set filetype=python fileencoding=utf-8:
2# -*- coding: utf-8 -*-
4#============================================================================#
5# #
6# Licensed under the Apache License, Version 2.0 (the "License"); #
7# you may not use this file except in compliance with the License. #
8# You may obtain a copy of the License at #
9# #
10# http://www.apache.org/licenses/LICENSE-2.0 #
11# #
12# Unless required by applicable law or agreed to in writing, software #
13# distributed under the License is distributed on an "AS IS" BASIS, #
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
15# See the License for the specific language governing permissions and #
16# limitations under the License. #
17# #
18#============================================================================#
21''' Core detection function implementations. '''
24from . import __
25from . import charsets as _charsets
26from . import core as _core
27from . import exceptions as _exceptions
28from . import mimetypes as _mimetypes
29from . import nomina as _nomina
30from . import validation as _validation
32from .core import ( # isort: skip
33 BEHAVIORS_DEFAULT as _BEHAVIORS_DEFAULT,
34 CHARSET_DEFAULT as _CHARSET_DEFAULT,
35 MIMETYPE_DEFAULT as _MIMETYPE_DEFAULT,
36 BehaviorTristate as _BehaviorTristate,
37 Behaviors as _Behaviors,
38 BehaviorsArgument as _BehaviorsArgument,
39 CharsetResult as _CharsetResult,
40 DetectFailureActions as _DetectFailureActions,
41 MimetypeResult as _MimetypeResult,
42)
# Detector signatures. A detector receives the content and the behaviors and
# either produces a detection result or returns ``NotImplemented``; the
# detection functions in this module skip detectors which return the latter.
CharsetDetector: __.typx.TypeAlias = __.typx.Annotated[
    __.cabc.Callable[
        [ _nomina.Content, _Behaviors ],
        _CharsetResult | __.types.NotImplementedType,
    ],
    __.ddoc.Doc(
        ''' Character set detector function.

            Takes bytes content and behaviors object.

            Returns either a detection result or ``NotImplemented``. The
            detection result will include the name of the character set, which
            has been determined as able to decode the content, or ``None``, if
            it believes that no character set is applicable to the content, and
            the confidence of the detection.
        ''' ),
]
MimetypeDetector: __.typx.TypeAlias = __.typx.Annotated[
    __.cabc.Callable[
        [ _nomina.Content, _Behaviors ],
        _MimetypeResult | __.types.NotImplementedType,
    ],
    __.ddoc.Doc(
        ''' MIME type detector function.

            Takes bytes content and behaviors object.

            Returns either a detection result or ``NotImplemented``. The
            detection result will include the MIME type and the confidence of
            the detection.
        ''' ),
]
# Detector registries, keyed by detector name. The names are what the
# ``charset_detectors_order`` and ``mimetype_detectors_order`` behaviors are
# looked up against; this module registers its own detectors further below.
charset_detectors: __.typx.Annotated[
    __.accret.Dictionary[ str, CharsetDetector ],
    __.ddoc.Doc( ''' Registry for character set detectors. ''' ),
] = __.accret.Dictionary( )

mimetype_detectors: __.typx.Annotated[
    __.accret.Dictionary[ str, MimetypeDetector ],
    __.ddoc.Doc( ''' Registry for MIME type detectors. ''' ),
] = __.accret.Dictionary( )
def detect_charset( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    default: _nomina.CharsetDefaultArgument = _CHARSET_DEFAULT,
    supplement: _nomina.CharsetSupplementArgument = __.absent,
    mimetype: _nomina.MimetypeAssumptionArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
) -> __.typx.Optional[ str ]:
    ''' Detects character set.

        Forwards every argument, unchanged, to
        :py:func:`detect_charset_confidence` and returns only the
        ``charset`` attribute of the result, discarding the confidence.
    '''
    return detect_charset_confidence(
        content,
        behaviors = behaviors,
        default = default,
        supplement = supplement,
        mimetype = mimetype,
        location = location,
    ).charset
def detect_charset_confidence( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    default: _nomina.CharsetDefaultArgument = _CHARSET_DEFAULT,
    supplement: _nomina.CharsetSupplementArgument = __.absent,
    mimetype: _nomina.MimetypeAssumptionArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
) -> _CharsetResult:
    ''' Detects character set candidates with confidence scores.

        Empty content is reported as 'utf-8' with confidence 1.0.

        Otherwise, detectors are consulted in the order given by
        ``behaviors.charset_detectors_order``. Names which are not in the
        registry and detectors which return ``NotImplemented`` are skipped;
        the first actual result ends the search.

        If no detector produces a result, then, according to
        ``behaviors.charset_on_detect_failure``, either ``default`` is
        returned with confidence 0.0 or ``CharsetDetectFailure`` is raised.

        A result without a charset is returned as is, unless a textual MIME
        type was supplied; in that case a trial decode is attempted.
        A result with a charset is handed off for confirmation.
    '''
    if content == b'':
        return _CharsetResult( charset = 'utf-8', confidence = 1.0 )
    # Sentinel remains in place only if every detector is missing or declines.
    detection = NotImplemented
    for name in behaviors.charset_detectors_order:
        detector = charset_detectors.get( name )
        if detector is None: continue
        detection = detector( content, behaviors )
        if detection is not NotImplemented: break
    if detection is NotImplemented:
        if behaviors.charset_on_detect_failure == _DetectFailureActions.Default:
            return _CharsetResult( charset = default, confidence = 0.0 )
        raise _exceptions.CharsetDetectFailure( location = location )
    if detection.charset is not None:
        return _confirm_charset_detection(
            content, behaviors, detection,
            supplement = supplement, location = location )
    # Detector found no applicable charset. Only second-guess it when the
    # caller asserts that the content has a textual MIME type.
    if __.is_absent( mimetype ): return detection
    if not _mimetypes.is_textual_mimetype( mimetype ): return detection
    trial = _charsets.trial_decode_as_confident(
        content,
        behaviors = behaviors,
        supplement = supplement,
        location = location )
    return _normalize_charset_detection( content, behaviors, trial )
def detect_mimetype(
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    default: _nomina.MimetypeDefaultArgument = _MIMETYPE_DEFAULT,
    charset: _nomina.CharsetAssumptionArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
) -> str:
    ''' Detects most probable MIME type.

        Forwards every argument, unchanged, to
        :py:func:`detect_mimetype_confidence` and returns only the
        ``mimetype`` attribute of the result, discarding the confidence.
    '''
    return detect_mimetype_confidence(
        content,
        behaviors = behaviors,
        default = default,
        charset = charset,
        location = location,
    ).mimetype
def detect_mimetype_confidence(
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    default: _nomina.MimetypeDefaultArgument = _MIMETYPE_DEFAULT,
    charset: _nomina.CharsetAssumptionArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
) -> _MimetypeResult:
    ''' Detects MIME type candidates with confidence scores.

        Empty content is reported as 'text/plain' with confidence 1.0.

        Otherwise, detectors are consulted in the order given by
        ``behaviors.mimetype_detectors_order``. Names which are not in the
        registry and detectors which return ``NotImplemented`` are skipped;
        the first actual result is returned.

        If no detector produces a result and a charset was supplied, then
        detection falls back to inference from that charset. Without a
        charset, according to ``behaviors.mimetype_on_detect_failure``,
        either ``default`` is returned with confidence 0.0 or
        ``MimetypeDetectFailure`` is raised.
    '''
    if content == b'':
        return _MimetypeResult( mimetype = 'text/plain', confidence = 1.0 )
    for name in behaviors.mimetype_detectors_order:
        detector = mimetype_detectors.get( name )
        if detector is None: continue
        detection = detector( content, behaviors )
        if detection is not NotImplemented: return detection
    if not __.is_absent( charset ):
        return _detect_mimetype_from_charset(
            content, behaviors, charset,
            default = default, location = location )
    if behaviors.mimetype_on_detect_failure == _DetectFailureActions.Default:
        return _MimetypeResult( mimetype = default, confidence = 0.0 )
    raise _exceptions.MimetypeDetectFailure( location = location )
def _confirm_charset_detection( # noqa: PLR0911
    content: _nomina.Content,
    behaviors: _Behaviors,
    result: _CharsetResult, /, *,
    supplement: __.Absential[ str ] = __.absent,
    location: __.Absential[ _nomina.Location ] = __.absent,
) -> _CharsetResult:
    ''' Confirms charset reported by detector.

        The detection is normalized and its charset is mapped through
        ``behaviors.charset_promotions``. Charsets with names starting with
        'utf-' go through a confident trial decode. Any other charset is
        accepted outright if trial decodes are disabled or if it equals the
        OS default charset; otherwise, decodes starting from 'utf-8-sig' are
        attempted and, if one succeeds with a different charset, that
        charset replaces the detected one.
    '''
    normalized = _normalize_charset_detection( content, behaviors, result )
    if normalized.charset is None: return normalized # pragma: no cover
    confidence = normalized.confidence
    charset = behaviors.charset_promotions.get(
        normalized.charset, normalized.charset )
    if charset.startswith( 'utf-' ):
        trial = _charsets.trial_decode_as_confident(
            content,
            behaviors = behaviors,
            supplement = supplement,
            inference = charset,
            confidence = confidence,
            location = location )
        return _normalize_charset_detection( content, behaviors, trial )
    candidate = _CharsetResult( charset = charset, confidence = confidence )
    if behaviors.trial_decode == _BehaviorTristate.Never: return candidate
    # Shake out false positives, like 'MacRoman'.
    if charset == _charsets.discover_os_charset_default( ):
        # Allow 'windows-1252', etc..., as appropriate.
        return candidate # pragma: no cover
    try:
        _, alternative = _charsets.attempt_decodes(
            content,
            behaviors = behaviors,
            inference = 'utf-8-sig',
            supplement = supplement,
            location = location )
    except _exceptions.ContentDecodeFailure: return candidate
    if charset == alternative.charset: return candidate # pragma: no cover
    return _normalize_charset_detection( content, behaviors, alternative )
def _detect_mimetype_from_charset(
    content: _nomina.Content,
    behaviors: _Behaviors,
    charset: str, /, *,
    default: str,
    location: __.Absential[ _nomina.Location ],
) -> _MimetypeResult:
    ''' Infers 'text/plain' MIME type from decodability with charset.

        The content must decode, with ``charset`` as the inference, and the
        decoded text must pass the textual validation profile. Then
        'text/plain' is returned with the confidence of the decode.

        Every other outcome counts as a detection failure, including trial
        decodes or text validation being disabled in ``behaviors``. On
        failure, according to ``behaviors.mimetype_on_detect_failure``,
        either ``default`` is returned with confidence 0.0 or
        ``MimetypeDetectFailure`` is raised.
    '''
    raises = not (
        behaviors.mimetype_on_detect_failure == _DetectFailureActions.Default )
    failure = _exceptions.MimetypeDetectFailure( location = location )
    fallback = _MimetypeResult( mimetype = default, confidence = 0.0 )
    if behaviors.trial_decode == _BehaviorTristate.Never:
        if raises: raise failure
        return fallback
    try:
        text, decode_result = _charsets.attempt_decodes(
            content,
            behaviors = behaviors, inference = charset, location = location )
    except _exceptions.ContentDecodeFailure:
        # Decode failure is an implementation detail of the detection.
        if raises: raise failure from None
        return fallback
    # Short-circuit: validator is not invoked when validation is disabled.
    is_textual = (
        not ( behaviors.text_validate == _BehaviorTristate.Never )
        and _validation.PROFILE_TEXTUAL( text ) )
    if not is_textual:
        if raises: raise failure
        return fallback
    return _MimetypeResult(
        mimetype = 'text/plain', confidence = decode_result.confidence )
def _detect_via_chardet(
    content: _nomina.Content, behaviors: _Behaviors
) -> _CharsetResult | __.types.NotImplementedType:
    ''' Detects charset via 'chardet' package, if it can be imported.

        Returns ``NotImplemented`` when the import fails. The ``behaviors``
        argument is accepted to satisfy the detector signature, but is not
        consulted.
    '''
    try: import chardet # pragma: no cover
    except ImportError: return NotImplemented # pragma: no cover
    detection = chardet.detect( content )
    return _CharsetResult(
        charset = detection[ 'encoding' ],
        confidence = detection[ 'confidence' ] )

charset_detectors[ 'chardet' ] = _detect_via_chardet
def _detect_via_charset_normalizer(
    content: _nomina.Content, behaviors: _Behaviors
) -> _CharsetResult | __.types.NotImplementedType:
    ''' Detects charset via 'charset-normalizer', if it can be imported.

        Returns ``NotImplemented`` when the import fails. The charset is the
        encoding of the best match, or ``None`` if there is no best match.
        The confidence is derived from the quantity of content bytes rather
        than taken from the match.
    '''
    try: import charset_normalizer # pragma: no cover
    except ImportError: return NotImplemented # pragma: no cover
    best = charset_normalizer.from_bytes( content ).best( )
    if best is None: charset = None # pragma: no cover
    else: charset = best.encoding
    return _CharsetResult(
        charset = charset,
        confidence = _core.confidence_from_bytes_quantity(
            content, behaviors = behaviors ) )

charset_detectors[ 'charset-normalizer' ] = _detect_via_charset_normalizer
def _detect_via_magic(
    content: _nomina.Content, behaviors: _Behaviors
) -> _MimetypeResult | __.types.NotImplementedType:
    ''' Detects MIME type via 'magic' module, if it can be imported.

        Returns ``NotImplemented`` when the import fails or when the
        detection raises any exception. The confidence is derived from the
        quantity of content bytes.
    '''
    try: import magic # pragma: no cover
    except ImportError: return NotImplemented # pragma: no cover
    # Best effort: any failure inside the library means "cannot tell".
    try: mimetype = magic.from_buffer( content, mime = True )
    except Exception: return NotImplemented # pragma: no cover
    return _MimetypeResult(
        mimetype = mimetype,
        confidence = _core.confidence_from_bytes_quantity(
            content, behaviors = behaviors ) )

mimetype_detectors[ 'magic' ] = _detect_via_magic
def _detect_via_puremagic(
    content: _nomina.Content, behaviors: _Behaviors
) -> _MimetypeResult | __.types.NotImplementedType:
    ''' Detects MIME type via 'puremagic' package, if it can be imported.

        Returns ``NotImplemented`` when the import fails or when the
        detection raises ``puremagic.PureError`` or ``ValueError``. The
        confidence is derived from the quantity of content bytes.
    '''
    try: import puremagic # pragma: no cover
    except ImportError: return NotImplemented # pragma: no cover
    try: mimetype = puremagic.from_string( content, mime = True )
    except ( puremagic.PureError, ValueError ): # pragma: no cover
        return NotImplemented
    return _MimetypeResult(
        mimetype = mimetype,
        confidence = _core.confidence_from_bytes_quantity(
            content, behaviors = behaviors ) )

mimetype_detectors[ 'puremagic' ] = _detect_via_puremagic
def _normalize_charset_detection(
    content: _nomina.Content, behaviors: _Behaviors, result: _CharsetResult
) -> _CharsetResult:
    ''' Normalizes charset name in detection result.

        A result without a charset is returned unchanged. Otherwise, the
        charset name is passed through ``normalize_charset`` and 'utf-8-sig'
        is demoted to 'utf-8' when the content does not begin with the UTF-8
        byte order mark. The confidence is carried over unchanged.

        The ``behaviors`` argument is accepted but not consulted.
    '''
    if result.charset is None: return result # pragma: no cover
    charset = _charsets.normalize_charset( result.charset )
    # The signature which 'utf-8-sig' refers to is the UTF-8 encoded mark.
    # Plain 'codecs.BOM' is the native-endian UTF-16 mark, which valid UTF-8
    # content cannot begin with, so comparing against it would always demote.
    has_signature = content.startswith( __.codecs.BOM_UTF8 )
    if charset == 'utf-8-sig' and not has_signature: charset = 'utf-8'
    return _CharsetResult( charset = charset, confidence = result.confidence )