Coverage for sources / detextive / charsets.py: 100%
55 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-14 04:38 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-14 04:38 +0000
1# vim: set filetype=python fileencoding=utf-8:
2# -*- coding: utf-8 -*-
4#============================================================================#
5# #
6# Licensed under the Apache License, Version 2.0 (the "License"); #
7# you may not use this file except in compliance with the License. #
8# You may obtain a copy of the License at #
9# #
10# http://www.apache.org/licenses/LICENSE-2.0 #
11# #
12# Unless required by applicable law or agreed to in writing, software #
13# distributed under the License is distributed on an "AS IS" BASIS, #
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
15# See the License for the specific language governing permissions and #
16# limitations under the License. #
17# #
18#============================================================================#
21''' Management of bytes array decoding via trial character sets. '''
24from . import __
25from . import core as _core
26from . import exceptions as _exceptions
27from . import nomina as _nomina
29from .core import ( # isort: skip
30 BEHAVIORS_DEFAULT as _BEHAVIORS_DEFAULT,
31 BehaviorTristate as _BehaviorTristate,
32 Behaviors as _Behaviors,
33 CharsetResult as _CharsetResult,
34 CodecSpecifiers as _CodecSpecifiers,
35)
38def attempt_decodes( # noqa: C901,PLR0912,PLR0913,PLR0915
39 content: _nomina.Content, /, *,
40 behaviors: _Behaviors = _BEHAVIORS_DEFAULT,
41 inference: __.Absential[ str ] = __.absent,
42 supplement: __.Absential[ str ] = __.absent,
43 location: __.Absential[ _nomina.Location ] = __.absent,
44 validator: __.Absential[
45 __.cabc.Callable[ [ str, _CharsetResult ], None ]
46 ] = __.absent,
47) -> tuple[ str, _CharsetResult ]:
48 ''' Attempts to decode content with various character sets.
50 Will try character sets in the order specified by the trial codecs
51 listed on the behaviors object.
52 '''
53 confidence = _core.confidence_from_bytes_quantity(
54 content, behaviors = behaviors )
55 on_decode_error = behaviors.on_decode_error
56 trials: set[ str ] = set( )
57 for codec in behaviors.trial_codecs:
58 match codec:
59 case _CodecSpecifiers.FromInference:
60 if __.is_absent( inference ): continue
61 charset = inference
62 case _CodecSpecifiers.OsDefault:
63 charset = discover_os_charset_default( )
64 case _CodecSpecifiers.PythonDefault:
65 charset = __.locale.getpreferredencoding( )
66 case _CodecSpecifiers.UserSupplement:
67 if __.is_absent( supplement ): continue
68 charset = supplement
69 case str( ): charset = codec
70 case _: continue
71 charset = normalize_charset(
72 charset, bom_cognizant = behaviors.remove_bom )
73 if charset in trials: continue
74 try: text = content.decode( charset, errors = on_decode_error )
75 except UnicodeDecodeError: continue
76 finally: trials.add( charset )
77 result = _CharsetResult( charset = charset, confidence = confidence )
78 if not __.is_absent( validator ):
79 try: validator( text, result )
80 except _exceptions.TextInvalidity: continue
81 return text, result
82 raise _exceptions.ContentDecodeFailure(
83 charset = tuple( trials ), location = location )
def discover_os_charset_default( ) -> str:
    ''' Reports normalized name of default character set for the OS.

        Uses ``locale.getencoding`` where the locale module provides it and
        otherwise falls back to ``sys.getfilesystemencoding``.
    '''
    if hasattr( __.locale, 'getencoding' ):
        name = __.locale.getencoding( )
    else: name = __.sys.getfilesystemencoding( )
    return normalize_charset( name )
def normalize_charset( charset: str, bom_cognizant: bool = False ) -> str:
    ''' Produces canonical codec name for character set name or alias.

        The canonical name is whatever the codec registry reports for the
        given name. If ``bom_cognizant`` is true and the canonical name is
        ``utf-8``, then ``utf-8-sig`` is returned in its place.

        Propagates ``LookupError`` from the codec registry lookup for a name
        without a registered codec.
    '''
    canonical = __.codecs.lookup( charset ).name
    if not bom_cognizant or canonical != 'utf-8': return canonical
    return 'utf-8-sig'
def trial_decode_as_confident( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _Behaviors = _BEHAVIORS_DEFAULT,
    inference: __.Absential[ str ] = __.absent,
    confidence: float = 0.0,
    supplement: __.Absential[ str ] = __.absent,
    location: __.Absential[ _nomina.Location ] = __.absent,
) -> _CharsetResult:
    ''' Resolves charset, trial-decoding content if behaviors require it.

        A trial decode is performed when ``behaviors.trial_decode`` is
        ``Always`` or when it is ``AsNeeded`` and ``confidence`` is below
        ``behaviors.trial_decode_confidence``. The result of the trial
        decode is then returned.

        Otherwise, the inferred charset is returned with the supplied
        confidence. ``CharsetDetectFailure`` is raised if there is no
        inference to return.
    '''
    policy = behaviors.trial_decode
    if policy == _BehaviorTristate.Always: decode = True
    elif policy == _BehaviorTristate.AsNeeded:
        decode = confidence < behaviors.trial_decode_confidence
    else: decode = False
    if decode:
        # Only the charset result is of interest; decoded text is dropped.
        _text, outcome = attempt_decodes(
            content,
            behaviors = behaviors,
            inference = inference,
            supplement = supplement,
            location = location )
        return outcome
    if not __.is_absent( inference ):
        return _CharsetResult( charset = inference, confidence = confidence )
    raise _exceptions.CharsetDetectFailure( location = location )