Coverage for sources / detextive / detectors.py: 100%
128 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-14 04:38 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-14 04:38 +0000
1# vim: set filetype=python fileencoding=utf-8:
2# -*- coding: utf-8 -*-
4#============================================================================#
5# #
6# Licensed under the Apache License, Version 2.0 (the "License"); #
7# you may not use this file except in compliance with the License. #
8# You may obtain a copy of the License at #
9# #
10# http://www.apache.org/licenses/LICENSE-2.0 #
11# #
12# Unless required by applicable law or agreed to in writing, software #
13# distributed under the License is distributed on an "AS IS" BASIS, #
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
15# See the License for the specific language governing permissions and #
16# limitations under the License. #
17# #
18#============================================================================#
21''' Core detection function implementations. '''
24from . import __
25from . import charsets as _charsets
26from . import core as _core
27from . import exceptions as _exceptions
28from . import mimetypes as _mimetypes
29from . import nomina as _nomina
30from . import validation as _validation
32from .core import ( # isort: skip
33 BEHAVIORS_DEFAULT as _BEHAVIORS_DEFAULT,
34 CHARSET_DEFAULT as _CHARSET_DEFAULT,
35 MIMETYPE_DEFAULT as _MIMETYPE_DEFAULT,
36 BehaviorTristate as _BehaviorTristate,
37 Behaviors as _Behaviors,
38 BehaviorsArgument as _BehaviorsArgument,
39 CharsetResult as _CharsetResult,
40 CodecSpecifiers as _CodecSpecifiers,
41 DetectFailureActions as _DetectFailureActions,
42 MimetypeResult as _MimetypeResult,
43)
# Signature for the callables stored in the ``charset_detectors`` registry:
# they receive the content plus the behaviors object and return either a
# charset result or ``NotImplemented``.
CharsetDetector: __.typx.TypeAlias = __.typx.Annotated[
    __.cabc.Callable[
        [ _nomina.Content, _Behaviors ],
        _CharsetResult | __.types.NotImplementedType
    ],
    __.ddoc.Doc(
        ''' Character set detector function.

            Takes bytes content and behaviors object.

            Returns either a detection result or ``NotImplemented``. The
            detection result will include the name of the character set, which
            has been determined as able to decode the content, or ``None``, if
            it believes that no character set is applicable to the content, and
            the confidence of the detection.
        ''' ),
]
# Signature for the callables stored in the ``mimetype_detectors`` registry:
# they receive the content plus the behaviors object and return either a
# MIME type result or ``NotImplemented``.
MimetypeDetector: __.typx.TypeAlias = __.typx.Annotated[
    __.cabc.Callable[
        [ _nomina.Content, _Behaviors ],
        _MimetypeResult | __.types.NotImplementedType,
    ],
    __.ddoc.Doc(
        ''' MIME type detector function.

            Takes bytes content and behaviors object.

            Returns either a detection result or ``NotImplemented``. The
            detection result will include the MIME type and the confidence of
            the detection.
        ''' ),
]
# Name-keyed registries of detector functions. The detection entry points
# look detectors up here by the names listed in
# ``behaviors.charset_detectors_order`` / ``behaviors.mimetype_detectors_order``
# and silently skip names which have no registered detector.
charset_detectors: __.typx.Annotated[
    __.accret.Dictionary[ str, CharsetDetector ],
    __.ddoc.Doc( ''' Registry for character set detectors. ''' ),
] = __.accret.Dictionary( )
mimetype_detectors: __.typx.Annotated[
    __.accret.Dictionary[ str, MimetypeDetector ],
    __.ddoc.Doc( ''' Registry for MIME type detectors. ''' ),
] = __.accret.Dictionary( )
def detect_charset( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    default: _nomina.CharsetDefaultArgument = _CHARSET_DEFAULT,
    supplement: _nomina.CharsetSupplementArgument = __.absent,
    mimetype: _nomina.MimetypeAssumptionArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
) -> __.typx.Optional[ str ]:
    ''' Detects character set.

        Convenience wrapper around :func:`detect_charset_confidence`: all
        arguments are forwarded unchanged and only the ``charset`` attribute
        of the resulting detection is returned.
    '''
    return detect_charset_confidence(
        content,
        behaviors = behaviors, default = default,
        supplement = supplement, mimetype = mimetype,
        location = location ).charset
def detect_charset_confidence( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    default: _nomina.CharsetDefaultArgument = _CHARSET_DEFAULT,
    supplement: _nomina.CharsetSupplementArgument = __.absent,
    mimetype: _nomina.MimetypeAssumptionArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
) -> _CharsetResult:
    ''' Detects character set candidates with confidence scores.

        Empty content yields the default charset with full confidence.
        Otherwise, registered detectors are tried in the order given by
        ``behaviors.charset_detectors_order`` until one of them returns
        something other than ``NotImplemented``. If none does, then, per
        ``behaviors.charset_on_detect_failure``, either the default charset
        is returned with zero confidence or ``CharsetDetectFailure`` is
        raised.

        A detection without a charset is returned as is, unless a textual
        MIME type was supplied, in which case a trial decode is attempted. A
        detection with a charset is confirmed before it is returned.
    '''
    if content == b'':
        return _CharsetResult( charset = default, confidence = 1.0 )
    detection: _CharsetResult | __.types.NotImplementedType = NotImplemented
    for detector_name in behaviors.charset_detectors_order:
        detector = charset_detectors.get( detector_name )
        # Names without a registered detector are skipped.
        if detector is None: continue
        detection = detector( content, behaviors )
        if detection is not NotImplemented: break
    if detection is NotImplemented:
        # No detector was able to produce a result.
        if behaviors.charset_on_detect_failure == _DetectFailureActions.Default:
            return _CharsetResult( charset = default, confidence = 0.0 )
        raise _exceptions.CharsetDetectFailure( location = location )
    if detection.charset is not None:
        return _confirm_charset_detection(
            content, behaviors, detection,
            supplement = supplement, location = location )
    # Detector found no applicable charset; only second-guess it when the
    # caller asserts that the content has a textual MIME type.
    if __.is_absent( mimetype ) or not _mimetypes.is_textual_mimetype( mimetype ):
        return detection
    decoded = _charsets.trial_decode_as_confident(
        content,
        behaviors = behaviors,
        supplement = supplement,
        location = location )
    return _normalize_charset_detection( content, behaviors, decoded )
def detect_mimetype(
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    default: _nomina.MimetypeDefaultArgument = _MIMETYPE_DEFAULT,
    charset: _nomina.CharsetAssumptionArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
) -> str:
    ''' Detects most probable MIME type.

        Convenience wrapper around :func:`detect_mimetype_confidence`: all
        arguments are forwarded unchanged and only the ``mimetype`` attribute
        of the resulting detection is returned.
    '''
    return detect_mimetype_confidence(
        content,
        behaviors = behaviors, default = default,
        charset = charset, location = location ).mimetype
def detect_mimetype_confidence(
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    default: _nomina.MimetypeDefaultArgument = _MIMETYPE_DEFAULT,
    charset: _nomina.CharsetAssumptionArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
) -> _MimetypeResult:
    ''' Detects MIME type candidates with confidence scores.

        Empty content is reported as ``text/plain`` with full confidence.
        Otherwise, registered detectors are tried in the order given by
        ``behaviors.mimetype_detectors_order`` until one of them returns
        something other than ``NotImplemented``.

        If no detector produced a result, or the result is non-textual with
        confidence below ``behaviors.trial_decode_confidence``, and a charset
        was supplied, then the content is checked against that charset; a
        ``text/plain`` outcome of that check takes precedence.

        If there is still no result, then, per
        ``behaviors.mimetype_on_detect_failure``, either the default MIME
        type is returned with zero confidence or ``MimetypeDetectFailure`` is
        raised.
    '''
    if content == b'':
        return _MimetypeResult( mimetype = 'text/plain', confidence = 1.0 )
    # Lazy pipeline: detectors are invoked one at a time, in order, and
    # evaluation stops at the first one which produces a result.
    registered = (
        mimetype_detectors.get( detector_name )
        for detector_name in behaviors.mimetype_detectors_order )
    detections = (
        detector( content, behaviors )
        for detector in registered if detector is not None )
    detection: _MimetypeResult | __.types.NotImplementedType = next(
        ( candidate for candidate in detections
          if candidate is not NotImplemented ),
        NotImplemented )
    if detection is NotImplemented: is_doubtful = True
    else:
        is_doubtful = (
            not _mimetypes.is_textual_mimetype( detection.mimetype )
            and detection.confidence < behaviors.trial_decode_confidence )
    if is_doubtful and not __.is_absent( charset ):
        # Validate against the supplied charset only; the OS default charset
        # must not be tried here.
        behaviors_inference_only = __.dcls.replace(
            behaviors, trial_codecs = ( _CodecSpecifiers.FromInference, ) )
        detection_via_charset = _detect_mimetype_from_charset(
            content, behaviors_inference_only, charset,
            default = default, location = location )
        if detection_via_charset.mimetype == 'text/plain':
            return detection_via_charset
    if detection is not NotImplemented: return detection
    if behaviors.mimetype_on_detect_failure == _DetectFailureActions.Default:
        return _MimetypeResult( mimetype = default, confidence = 0.0 )
    raise _exceptions.MimetypeDetectFailure( location = location )
def _confirm_charset_detection( # noqa: PLR0911
    content: _nomina.Content,
    behaviors: _Behaviors,
    result: _CharsetResult, /, *,
    supplement: __.Absential[ str ] = __.absent,
    location: __.Absential[ _nomina.Location ] = __.absent,
) -> _CharsetResult:
    ''' Confirms a charset detection by means of trial decodes.

        The detection is normalized first. A ``utf-*`` charset is then
        verified by a trial decode which carries the detected confidence
        along. Any other charset is returned as is when trial decodes are
        disabled or when it equals the OS default charset; otherwise, a
        UTF-8 decode is attempted and, if it succeeds with a different
        charset, that charset replaces the detected one while the detected
        confidence is kept.
    '''
    confirmed = _normalize_charset_detection( content, behaviors, result )
    if confirmed.charset is None: return confirmed # pragma: no cover
    charset, confidence = confirmed.charset, confirmed.confidence
    # Trial decodes below must never fall back to the OS default charset.
    codecs_sans_os_default = (
        _CodecSpecifiers.UserSupplement, _CodecSpecifiers.FromInference )
    if charset.startswith( 'utf-' ):
        behaviors_restricted = __.dcls.replace(
            behaviors, trial_codecs = codecs_sans_os_default )
        redecoded = _charsets.trial_decode_as_confident(
            content,
            behaviors = behaviors_restricted,
            supplement = supplement,
            inference = charset,
            confidence = confidence,
            location = location )
        return _normalize_charset_detection( content, behaviors, redecoded )
    if behaviors.trial_decode == _BehaviorTristate.Never: return confirmed
    # Shake out false positives, like 'MacRoman'.
    if charset == _charsets.discover_os_charset_default( ):
        # Allow 'windows-1252', etc..., as appropriate.
        return confirmed # pragma: no cover
    behaviors_restricted = __.dcls.replace(
        behaviors, trial_codecs = codecs_sans_os_default )
    try:
        _, decoded_as_utf8 = _charsets.attempt_decodes(
            content,
            behaviors = behaviors_restricted,
            inference = 'utf-8-sig',
            supplement = supplement,
            location = location )
    except _exceptions.ContentDecodeFailure: return confirmed
    if charset == decoded_as_utf8.charset: return confirmed # pragma: no cover
    # The content decodes with another charset: prefer that charset, but
    # retain the confidence which the detector reported.
    replacement = _CharsetResult(
        charset = decoded_as_utf8.charset, confidence = confidence )
    return _normalize_charset_detection( content, behaviors, replacement )
def _detect_mimetype_from_charset(
    content: _nomina.Content,
    behaviors: _Behaviors,
    charset: str, /, *,
    default: str,
    location: __.Absential[ _nomina.Location ],
) -> _MimetypeResult:
    ''' Infers ``text/plain`` MIME type from a successful charset decode.

        The content is decoded with the given charset as inference and the
        decoded text is validated against the textual profile. On success,
        ``text/plain`` is returned with the confidence of the decode.

        On failure (trial decodes disabled, decode failure, text validation
        disabled, or text not deemed textual), then, per
        ``behaviors.mimetype_on_detect_failure``, either the default MIME
        type is returned with zero confidence or ``MimetypeDetectFailure`` is
        raised.
    '''
    raises_on_failure = not (
        behaviors.mimetype_on_detect_failure == _DetectFailureActions.Default )
    failure = _exceptions.MimetypeDetectFailure( location = location )
    fallback = _MimetypeResult( mimetype = default, confidence = 0.0 )
    if behaviors.trial_decode == _BehaviorTristate.Never:
        if raises_on_failure: raise failure
        return fallback
    try:
        text, decode_result = _charsets.attempt_decodes(
            content,
            behaviors = behaviors, inference = charset, location = location )
    except _exceptions.ContentDecodeFailure:
        # Decode failure is an implementation detail; suppress its context.
        if raises_on_failure: raise failure from None
        return fallback
    # Short-circuit: the textual profile is only consulted when validation
    # has not been disabled.
    if (    behaviors.text_validate == _BehaviorTristate.Never
        or not _validation.PROFILE_TEXTUAL( text )
    ):
        if raises_on_failure: raise failure
        return fallback
    return _MimetypeResult(
        mimetype = 'text/plain', confidence = decode_result.confidence )
def _detect_via_chardet(
    content: _nomina.Content, behaviors: _Behaviors
) -> _CharsetResult | __.types.NotImplementedType:
    ''' Detects charset with ``chardet``, if it is importable.

        Returns ``NotImplemented`` when the package cannot be imported.
        Otherwise, relays the encoding and confidence which it reports.
    '''
    try: import chardet # pragma: no cover
    except ImportError: return NotImplemented # pragma: no cover
    detection = chardet.detect( content )
    return _CharsetResult(
        charset = detection[ 'encoding' ],
        confidence = detection[ 'confidence' ] )

charset_detectors[ 'chardet' ] = _detect_via_chardet
def _detect_via_charset_normalizer(
    content: _nomina.Content, behaviors: _Behaviors
) -> _CharsetResult | __.types.NotImplementedType:
    ''' Detects charset with ``charset_normalizer``, if it is importable.

        Returns ``NotImplemented`` when the package cannot be imported.
        Otherwise, reports the encoding of the best match, or ``None`` if
        there is no match, with a confidence derived from the quantity of
        content bytes.
    '''
    try: import charset_normalizer # pragma: no cover
    except ImportError: return NotImplemented # pragma: no cover
    best_match = charset_normalizer.from_bytes( content ).best( )
    if best_match is None: charset = None # pragma: no cover
    else: charset = best_match.encoding # pragma: no cover
    return _CharsetResult(
        charset = charset,
        confidence = _core.confidence_from_bytes_quantity(
            content, behaviors = behaviors ) )

charset_detectors[ 'charset-normalizer' ] = _detect_via_charset_normalizer
def _detect_via_magic(
    content: _nomina.Content, behaviors: _Behaviors
) -> _MimetypeResult | __.types.NotImplementedType:
    ''' Detects MIME type with ``magic``, if it is importable.

        Returns ``NotImplemented`` when the package cannot be imported or
        when its detection raises any exception. Otherwise, reports the
        detected MIME type with a confidence derived from the quantity of
        content bytes.
    '''
    try: import magic # pragma: no cover
    except ImportError: return NotImplemented # pragma: no cover
    # Best effort: any failure inside the library means "no opinion".
    try: detected = magic.from_buffer( content, mime = True )
    except Exception: return NotImplemented # pragma: no cover
    return _MimetypeResult(
        mimetype = detected,
        confidence = _core.confidence_from_bytes_quantity(
            content, behaviors = behaviors ) )

mimetype_detectors[ 'magic' ] = _detect_via_magic
def _detect_via_puremagic(
    content: _nomina.Content, behaviors: _Behaviors
) -> _MimetypeResult | __.types.NotImplementedType:
    ''' Detects MIME type with ``puremagic``, if it is importable.

        Returns ``NotImplemented`` when the package cannot be imported or
        when its detection raises ``puremagic.PureError`` or ``ValueError``.
        Otherwise, reports the detected MIME type with a confidence derived
        from the quantity of content bytes.
    '''
    try: import puremagic # pragma: no cover
    except ImportError: return NotImplemented # pragma: no cover
    try: detected = puremagic.from_string( content, mime = True )
    except ( puremagic.PureError, ValueError ): # pragma: no cover
        return NotImplemented
    return _MimetypeResult(
        mimetype = detected,
        confidence = _core.confidence_from_bytes_quantity(
            content, behaviors = behaviors ) )

mimetype_detectors[ 'puremagic' ] = _detect_via_puremagic
def _normalize_charset_detection(
    content: _nomina.Content, behaviors: _Behaviors, result: _CharsetResult
) -> _CharsetResult:
    ''' Normalizes the charset name of a detection result.

        A result without a charset is returned unchanged. Otherwise, the
        charset name is normalized and a new result with the same confidence
        is returned. A ``utf-8-sig`` detection is reported as ``utf-8`` unless
        the content actually begins with the UTF-8 byte order mark.

        The ``behaviors`` argument is currently unused; it is accepted so
        that existing callers continue to work.
    '''
    if result.charset is None: return result # pragma: no cover
    charset = _charsets.normalize_charset( result.charset )
    # The signature for 'utf-8-sig' is the UTF-8 BOM (EF BB BF). The generic
    # ``codecs.BOM`` constant is the native-endian UTF-16 BOM and would never
    # match UTF-8 content, which would unconditionally downgrade 'utf-8-sig'.
    # NOTE(review): assumes ``__.codecs`` is the standard library module.
    if (    charset == 'utf-8-sig'
        and not content.startswith( __.codecs.BOM_UTF8 )
    ): charset = 'utf-8'
    return _CharsetResult( charset = charset, confidence = result.confidence )