Coverage for sources / detextive / charsets.py: 100%
94 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-17 06:15 +0000
1# vim: set filetype=python fileencoding=utf-8:
2# -*- coding: utf-8 -*-
4#============================================================================#
5# #
6# Licensed under the Apache License, Version 2.0 (the "License"); #
7# you may not use this file except in compliance with the License. #
8# You may obtain a copy of the License at #
9# #
10# http://www.apache.org/licenses/LICENSE-2.0 #
11# #
12# Unless required by applicable law or agreed to in writing, software #
13# distributed under the License is distributed on an "AS IS" BASIS, #
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
15# See the License for the specific language governing permissions and #
16# limitations under the License. #
17# #
18#============================================================================#
21''' Management of bytes array decoding via trial character sets. '''
24from . import __
25from . import core as _core
26from . import exceptions as _exceptions
27from . import nomina as _nomina
29from .core import ( # isort: skip
30 BEHAVIORS_DEFAULT as _BEHAVIORS_DEFAULT,
31 BehaviorTristate as _BehaviorTristate,
32 Behaviors as _Behaviors,
33 CharsetResult as _CharsetResult,
34 CodecSpecifiers as _CodecSpecifiers,
35)
def attempt_decodes( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _Behaviors = _BEHAVIORS_DEFAULT,
    inference: __.Absential[ str ] = __.absent,
    supplement: __.Absential[ str ] = __.absent,
    location: __.Absential[ _nomina.Location ] = __.absent,
    validator: __.Absential[
        __.cabc.Callable[ [ str, _CharsetResult ], None ]
    ] = __.absent,
) -> tuple[ str, _CharsetResult ]:
    ''' Attempts to decode content with various character sets.

        Character sets are tried in the order given by the trial codecs
        listed on the behaviors object. The first decode which succeeds
        (and which satisfies the validator, if one is supplied) wins.
    '''
    confidence = _core.confidence_from_bytes_quantity(
        content, behaviors = behaviors )
    candidates = _collect_trial_codecs(
        content,
        behaviors = behaviors,
        inference = inference,
        supplement = supplement )
    attempted: set[ str ] = set( )
    has_validator = not __.is_absent( validator )
    for candidate in candidates:
        # Record every candidate we touch, even on decode failure,
        # so the failure exception can report all attempted charsets.
        attempted.add( candidate )
        try:
            text = content.decode(
                candidate, errors = behaviors.on_decode_error )
        except UnicodeDecodeError: continue
        result = _CharsetResult(
            charset = normalize_charset_for_content( content, candidate ),
            confidence = confidence )
        if has_validator:
            try: validator( text, result )
            except _exceptions.TextInvalidity: continue
        return text, result
    raise _exceptions.ContentDecodeFailure(
        charset = tuple( attempted ), location = location )
def discover_os_charset_default( ) -> str:
    ''' Discovers default character set encoding from operating system. '''
    # 'locale.getencoding' appeared in Python 3.11; fall back to the
    # filesystem encoding on older interpreters.
    if hasattr( __.locale, 'getencoding' ):
        return normalize_charset( __.locale.getencoding( ) )
    return normalize_charset( __.sys.getfilesystemencoding( ) )
def normalize_charset( charset: str, bom_cognizant: bool = False ) -> str:
    ''' Normalizes character set encoding names.

        Resolves aliases to the codec registry's canonical name. When
        BOM-cognizant, reports plain UTF-8 as its BOM-bearing variant.
    '''
    canonical = __.codecs.lookup( charset ).name
    if 'utf-8' == canonical and bom_cognizant: return 'utf-8-sig'
    return canonical
def normalize_charset_for_content(
    content: _nomina.Content, charset: str
) -> str:
    ''' Normalizes charset reporting based on byte-order mark provenance. '''
    canonical = normalize_charset( charset )
    bom_charset = _discover_utf_bom_charset( content )
    if canonical in ( 'utf-8', 'utf-8-sig' ):
        # Content with a UTF-8 BOM is reported as the '-sig' variant.
        return 'utf-8-sig' if 'utf-8-sig' == bom_charset else 'utf-8'
    for family in ( 'utf-16', 'utf-32' ):
        if canonical.startswith( family ):
            # A matching BOM determines byte order; report bare family name.
            if bom_charset in ( f"{family}-le", f"{family}-be" ):
                return family
            break
    return canonical
108def trial_decode_as_confident( # noqa: PLR0913
109 content: _nomina.Content, /, *,
110 behaviors: _Behaviors = _BEHAVIORS_DEFAULT,
111 inference: __.Absential[ str ] = __.absent,
112 confidence: float = 0.0,
113 supplement: __.Absential[ str ] = __.absent,
114 location: __.Absential[ _nomina.Location ] = __.absent,
115) -> _CharsetResult:
116 ''' Performs trial decode of content.
118 Considers desired trial decode behavior and detection confidence.
119 '''
120 nomargs: __.NominativeArguments = dict(
121 behaviors = behaviors,
122 inference = inference,
123 supplement = supplement,
124 location = location )
125 should_decode = False
126 match behaviors.trial_decode:
127 case _BehaviorTristate.Always: should_decode = True
128 case _BehaviorTristate.AsNeeded:
129 should_decode = confidence < behaviors.trial_decode_confidence
130 case _BehaviorTristate.Never: pass
131 if should_decode:
132 _, result = attempt_decodes( content, **nomargs )
133 return result
134 if __.is_absent( inference ):
135 raise _exceptions.CharsetDetectFailure( location = location )
136 return _CharsetResult( charset = inference, confidence = confidence )
def _collect_trial_codecs(
    content: _nomina.Content, /, *,
    behaviors: _Behaviors,
    inference: __.Absential[ str ],
    supplement: __.Absential[ str ],
) -> tuple[ str, ... ]:
    ''' Resolves trial codec specifiers into unique charset candidates. '''
    # Candidate list is tiny; linear dedup keeps insertion order cheaply.
    collected: list[ str ] = [ ]
    for specifier in behaviors.trial_codecs:
        resolution = _resolve_trial_codec(
            specifier, inference = inference, supplement = supplement )
        if __.is_absent( resolution ): continue
        candidate = normalize_charset( resolution )
        if _is_ambiguous_utf_trial( content, candidate, behaviors ):
            continue
        if 'utf-8' == candidate and behaviors.remove_bom:
            candidate = 'utf-8-sig'
        if candidate not in collected: collected.append( candidate )
    return tuple( collected )
def _discover_utf_bom_charset(
    content: _nomina.Content
) -> __.typx.Optional[ str ]:
    ''' Detects charset from a leading byte-order mark, if any. '''
    # UTF-32 markers must be tested before UTF-16 markers,
    # since the latter prefix-match them.
    markers = (
        ( __.codecs.BOM_UTF32_LE, 'utf-32-le' ),
        ( __.codecs.BOM_UTF32_BE, 'utf-32-be' ),
        ( __.codecs.BOM_UTF8, 'utf-8-sig' ),
        ( __.codecs.BOM_UTF16_LE, 'utf-16-le' ),
        ( __.codecs.BOM_UTF16_BE, 'utf-16-be' ),
    )
    for marker, charset in markers:
        if content.startswith( marker ): return charset
    return None
def _is_ambiguous_utf_trial(
    content: _nomina.Content, charset: str, behaviors: _Behaviors
) -> bool:
    ''' Checks whether a UTF-16/32 trial lacks required byte-order mark. '''
    if not behaviors.utf_16_32_requires_byte_order: return False
    if charset not in ( 'utf-16', 'utf-32' ): return False
    # Bare 'utf-16'/'utf-32' is ambiguous without a BOM fixing byte order.
    bom_charset = _discover_utf_bom_charset( content )
    return bom_charset not in ( f"{charset}-le", f"{charset}-be" )
184def _resolve_trial_codec(
185 codec: __.typx.Any, /, *,
186 inference: __.Absential[ str ],
187 supplement: __.Absential[ str ],
188) -> __.Absential[ str ]:
189 charset: __.Absential[ str ] = __.absent
190 match codec:
191 case _CodecSpecifiers.FromInference:
192 if not __.is_absent( inference ): charset = inference
193 case _CodecSpecifiers.OsDefault:
194 charset = discover_os_charset_default( )
195 case _CodecSpecifiers.PythonDefault:
196 charset = __.locale.getpreferredencoding( )
197 case _CodecSpecifiers.UserSupplement:
198 if not __.is_absent( supplement ): charset = supplement
199 case str( ): charset = codec
200 case _: pass
201 return charset