Coverage for sources/detextive/charsets.py: 100%
49 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-20 18:02 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-20 18:02 +0000
1# vim: set filetype=python fileencoding=utf-8:
2# -*- coding: utf-8 -*-
4#============================================================================#
5# #
6# Licensed under the Apache License, Version 2.0 (the "License"); #
7# you may not use this file except in compliance with the License. #
8# You may obtain a copy of the License at #
9# #
10# http://www.apache.org/licenses/LICENSE-2.0 #
11# #
12# Unless required by applicable law or agreed to in writing, software #
13# distributed under the License is distributed on an "AS IS" BASIS, #
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
15# See the License for the specific language governing permissions and #
16# limitations under the License. #
17# #
18#============================================================================#
21''' Management of bytes array decoding via trial character sets. '''
24from . import __
25from . import core as _core
26from . import exceptions as _exceptions
27from . import nomina as _nomina
29from .core import ( # isort: skip
30 BEHAVIORS_DEFAULT as _BEHAVIORS_DEFAULT,
31 BehaviorTristate as _BehaviorTristate,
32 Behaviors as _Behaviors,
33 CharsetResult as _CharsetResult,
34 CodecSpecifiers as _CodecSpecifiers,
35)
38def attempt_decodes(
39 content: _nomina.Content, /, *,
40 behaviors: _Behaviors = _BEHAVIORS_DEFAULT,
41 inference: __.Absential[ str ] = __.absent,
42 supplement: __.Absential[ str ] = __.absent,
43 location: __.Absential[ _nomina.Location ] = __.absent,
44) -> tuple[ str, _CharsetResult ]:
45 ''' Attempts to decode content with various character sets.
47 Will try character sets in the order specified by the trial codecs
48 listed on the behaviors object.
49 '''
50 confidence = _core.confidence_from_bytes_quantity(
51 content, behaviors = behaviors )
52 on_decode_error = behaviors.on_decode_error
53 trials: list[ str ] = [ ]
54 for codec in behaviors.trial_codecs:
55 match codec:
56 case _CodecSpecifiers.FromInference:
57 if __.is_absent( inference ): continue
58 charset = inference
59 case _CodecSpecifiers.OsDefault:
60 charset = discover_os_charset_default( )
61 case _CodecSpecifiers.PythonDefault:
62 charset = __.locale.getpreferredencoding( )
63 case _CodecSpecifiers.UserSupplement:
64 if __.is_absent( supplement ): continue
65 charset = supplement
66 case str( ): charset = codec
67 case _: continue
68 try: text = content.decode( charset, errors = on_decode_error )
69 except UnicodeDecodeError:
70 trials.append( charset )
71 continue
72 result = _CharsetResult( charset = charset, confidence = confidence )
73 return text, result
74 raise _exceptions.ContentDecodeFailure(
75 charset = trials, location = location )
def discover_os_charset_default( ) -> str:
    ''' Reports the operating system default character set, normalized.

        Uses ``locale.getencoding`` when the locale module provides it and
        ``sys.getfilesystemencoding`` otherwise.
    '''
    if hasattr( __.locale, 'getencoding' ):
        encoding = __.locale.getencoding( )
    else: encoding = __.sys.getfilesystemencoding( )
    return normalize_charset( encoding )
def normalize_charset( charset: str ) -> str:
    ''' Returns the name under which the codec registry knows a charset. '''
    codec_info = __.codecs.lookup( charset )
    return codec_info.name
def trial_decode_as_confident(  # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _Behaviors = _BEHAVIORS_DEFAULT,
    inference: __.Absential[ str ] = __.absent,
    confidence: float = 0.0,
    supplement: __.Absential[ str ] = __.absent,
    location: __.Absential[ _nomina.Location ] = __.absent,
) -> _CharsetResult:
    ''' Performs trial decode of content, if warranted.

        The trial decode behavior decides whether decodes are attempted:
        always, never, or only when the supplied detection confidence is
        below the trial decode confidence threshold on the behaviors object.

        When decodes are attempted, returns the result from
        ``attempt_decodes``. Otherwise, returns a result built from the
        inference and the supplied confidence, or raises
        ``CharsetDetectFailure`` if the inference is absent.
    '''
    policy = behaviors.trial_decode
    if policy == _BehaviorTristate.Always: wants_trial = True
    elif policy == _BehaviorTristate.AsNeeded:
        wants_trial = confidence < behaviors.trial_decode_confidence
    else: wants_trial = False
    if wants_trial:
        # Only the charset result is of interest; the text is discarded.
        return attempt_decodes(
            content,
            behaviors = behaviors,
            inference = inference,
            supplement = supplement,
            location = location )[ 1 ]
    if not __.is_absent( inference ):
        return _CharsetResult( charset = inference, confidence = confidence )
    raise _exceptions.CharsetDetectFailure( location = location )