Coverage for sources / detextive / decoders.py: 100%
80 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-14 04:38 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-14 04:38 +0000
1# vim: set filetype=python fileencoding=utf-8:
2# -*- coding: utf-8 -*-
4#============================================================================#
5# #
6# Licensed under the Apache License, Version 2.0 (the "License"); #
7# you may not use this file except in compliance with the License. #
8# You may obtain a copy of the License at #
9# #
10# http://www.apache.org/licenses/LICENSE-2.0 #
11# #
12# Unless required by applicable law or agreed to in writing, software #
13# distributed under the License is distributed on an "AS IS" BASIS, #
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
15# See the License for the specific language governing permissions and #
16# limitations under the License. #
17# #
18#============================================================================#
21''' Conversion of bytes arrays to Unicode text. '''
24from . import __
25from . import charsets as _charsets
26from . import detectors as _detectors
27from . import exceptions as _exceptions
28from . import inference as _inference
29from . import lineseparators as _lineseparators
30from . import mimetypes as _mimetypes
31from . import nomina as _nomina
32from . import validation as _validation
34from .core import ( # isort: skip
35 BEHAVIORS_DEFAULT as _BEHAVIORS_DEFAULT,
36 BehaviorTristate as _BehaviorTristate,
37 BehaviorsArgument as _BehaviorsArgument,
38 CharsetResult as _CharsetResult,
39 CodecSpecifiers as _CodecSpecifiers,
40 MimetypeResult as _MimetypeResult,
41)
# MIME type which ``decode_inform`` uses as the default value of its
# ``mimetype_default`` parameter.
_MIMETYPE_DEFAULT_TEXTUAL = 'text/plain'
class DecodeInformResult( __.immut.DataclassObject ):
    ''' Decoded text together with metadata inferred alongside it.

        Bundles the decoded text with the charset result, the MIME type
        result, and the line separator detected from the content, if any.
    '''

    text: __.typx.Annotated[
        str,
        __.ddoc.Doc( ''' Decoded text content. ''' ),
    ]
    charset: __.typx.Annotated[
        _CharsetResult,
        __.ddoc.Doc( ''' Charset used for decoding. ''' ),
    ]
    mimetype: __.typx.Annotated[
        _MimetypeResult,
        __.ddoc.Doc( ''' Inferred MIME type metadata. ''' ),
    ]
    linesep: __.typx.Annotated[
        __.typx.Optional[ _lineseparators.LineSeparators ],
        __.ddoc.Doc( ''' Detected line separator from content sample. ''' ),
    ]
def decode( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    profile: _validation.ProfileArgument = _validation.PROFILE_TEXTUAL,
    http_content_type: _nomina.HttpContentTypeArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
    charset_supplement: _nomina.CharsetSupplementArgument = __.absent,
) -> str:
    ''' Decodes bytes array to Unicode text.

        Decoding proceeds by trial and validation. No default value is
        returned in lieu of a successful decode: ``charset_supplement`` is
        merely one more hint to try, never a fallback result.

        From ``http_content_type``, only the charset portion is used; the
        MIME type portion is discarded. The charset metadata produced
        during decoding is likewise dropped and only the text is returned.
    '''
    _mimetype_unused, header_charset = (
        _parse_http_content_type( http_content_type ) )
    text, _charset_unused = _decode_content_charset_result(
        content, behaviors, profile,
        httpct_charset = header_charset,
        location = location,
        charset_supplement = charset_supplement )
    return text
def decode_inform( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    profile: _validation.ProfileArgument = _validation.PROFILE_TEXTUAL,
    mimetype_default: _nomina.MimetypeDefaultArgument = (
        _MIMETYPE_DEFAULT_TEXTUAL ),
    http_content_type: _nomina.HttpContentTypeArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
    charset_supplement: _nomina.CharsetSupplementArgument = __.absent,
) -> DecodeInformResult:
    ''' Decodes bytes and returns supplemental inference metadata.

        Steps, in order:

        * Splits ``http_content_type`` into MIME type and charset parts.
        * Decodes the content, obtaining text and a charset result.
        * Infers a MIME type result, passing along the charset just used.
        * Detects the line separator from the raw bytes.
    '''
    header_mimetype, header_charset = (
        _parse_http_content_type( http_content_type ) )
    decoded, charset_info = _decode_content_charset_result(
        content, behaviors, profile,
        httpct_charset = header_charset,
        location = location,
        charset_supplement = charset_supplement )
    mimetype_info = _infer_mimetype(
        content, behaviors,
        mimetype_default = mimetype_default,
        httpct_mimetype = header_mimetype,
        location = location,
        charset = charset_info.charset )
    # Line separator detection runs on the original bytes, not the text.
    return DecodeInformResult(
        text = decoded,
        charset = charset_info,
        mimetype = mimetype_info,
        linesep = _lineseparators.LineSeparators.detect_bytes( content ) )
def _attempt_decode_http_content_type(
    content: _nomina.Content,
    behaviors: _BehaviorsArgument,
    profile: _validation.ProfileArgument, /, *,
    httpct_charset: __.Absential[ __.typx.Optional[ str ] ],
    location: _nomina.LocationArgument,
) -> __.Absential[ tuple[ str, _CharsetResult ] ]:
    ''' Attempts decode using only charset from HTTP Content-Type.

        Returns absence if no charset was supplied or if the decode attempt
        raises ``ContentDecodeFailure``. Otherwise, returns the decoded
        text and its charset result, after text validation.

        Raises ``ContentDecodeImpossibility`` if the charset is ``None``.
        (Presumably, ``None`` signals that the header rules out textual
        content; confirm against ``_inference.parse_http_content_type``.)

        NOTE(review): A validation failure propagates to the caller,
        whereas a decode failure yields absence. Confirm this asymmetry is
        intended.
    '''
    impossibility = _exceptions.ContentDecodeImpossibility(
        location = location )
    if httpct_charset is None: raise impossibility
    if __.is_absent( httpct_charset ): return __.absent
    # Restrict trial codecs to the one named by the header.
    header_only = __.dcls.replace(
        behaviors, trial_codecs = ( _CodecSpecifiers.FromInference, ) )
    try:
        attempt = _charsets.attempt_decodes(
            content,
            behaviors = header_only,
            inference = httpct_charset,
            location = location )
    except _exceptions.ContentDecodeFailure:
        return __.absent
    text, charset_result = attempt
    # Validation uses the caller's behaviors, not the restricted copy.
    _validate_text(
        text, charset_result.confidence,
        behaviors = behaviors, profile = profile, location = location )
    return text, charset_result
def _decode_content_charset_result( # noqa: PLR0913
    content: _nomina.Content,
    behaviors: _BehaviorsArgument,
    profile: _validation.ProfileArgument, /, *,
    httpct_charset: __.Absential[ __.typx.Optional[ str ] ],
    location: _nomina.LocationArgument,
    charset_supplement: _nomina.CharsetSupplementArgument,
) -> tuple[ str, _CharsetResult ]:
    ''' Decodes content and reports which charset was used.

        Order of attempts:

        * Empty content yields empty text with charset ``utf-8`` at
          confidence 1.0, without any further work.
        * A charset from the HTTP Content-Type header is tried first; if
          that attempt produces a result, it is returned as is.
        * Otherwise, charset detection runs. Its charset is used as the
          inference only if it is non-empty and its confidence is at least
          ``behaviors.trial_decode_confidence``. A detection failure is
          suppressed and leaves the inference absent.
        * Finally, trial decodes run with text validation applied to each
          attempt via the supplied validator.
    '''
    if content == b'':
        return '', _CharsetResult( charset = 'utf-8', confidence = 1.0 )
    from_header = _attempt_decode_http_content_type(
        content, behaviors, profile,
        httpct_charset = httpct_charset, location = location )
    if not __.is_absent( from_header ): return from_header
    inferred: __.Absential[ str ] = __.absent
    threshold = behaviors.trial_decode_confidence
    # Detection runs with trial decoding disabled.
    # NOTE(review): Presumably because the trial decodes happen below;
    #               confirm against the detector.
    detection_behaviors = __.dcls.replace(
        behaviors, trial_decode = _BehaviorTristate.Never )
    with __.ctxl.suppress( _exceptions.CharsetDetectFailure ):
        detection = _detectors.detect_charset_confidence(
            content,
            behaviors = detection_behaviors,
            supplement = charset_supplement,
            location = location )
        if detection.charset and detection.confidence >= threshold:
            inferred = detection.charset
    return _charsets.attempt_decodes(
        content,
        behaviors = behaviors,
        inference = inferred,
        supplement = charset_supplement,
        location = location,
        validator = __.funct.partial(
            _validate_text_in_decode_attempt,
            behaviors = behaviors,
            profile = profile,
            location = location ) )
def _infer_mimetype( # noqa: PLR0913
    content: _nomina.Content,
    behaviors: _BehaviorsArgument, /, *,
    mimetype_default: _nomina.MimetypeDefaultArgument,
    httpct_mimetype: __.Absential[ str ],
    location: _nomina.LocationArgument,
    charset: __.typx.Optional[ str ],
) -> _MimetypeResult:
    ''' Infers textual MIME type for content which has been decoded.

        Sources are consulted in order and the first textual one wins:

        * MIME type from the HTTP Content-Type header (confidence 0.9).
        * MIME type derived from the location (confidence 0.9).
        * MIME type detected from content, if ``behaviors.mimetype_detect``
          is truthy (result returned as detected).

        If none of these yields a textual MIME type, then
        ``mimetype_default`` is returned with confidence 1.0.

        NOTE(review): ``behaviors.mimetype_detect`` is tested by
        truthiness. If it is a tristate enumeration, confirm that its
        ``Never`` member is falsy.
    '''
    declared_confidence = 0.9
    if not __.is_absent( httpct_mimetype ):
        if _mimetypes.is_textual_mimetype( httpct_mimetype ):
            return _MimetypeResult(
                mimetype = httpct_mimetype,
                confidence = declared_confidence )
    if not __.is_absent( location ):
        from_location = _mimetypes.mimetype_from_location( location )
        if not __.is_absent( from_location ):
            if _mimetypes.is_textual_mimetype( from_location ):
                return _MimetypeResult(
                    mimetype = from_location,
                    confidence = declared_confidence )
    detected: __.Absential[ _MimetypeResult ] = __.absent
    if behaviors.mimetype_detect:
        # Detector receives absence, rather than None, for unknown charset.
        if charset is None: charset_hint = __.absent
        else: charset_hint = charset
        detected = _detectors.detect_mimetype_confidence(
            content,
            behaviors = behaviors,
            default = mimetype_default,
            charset = charset_hint,
            location = location )
    if (    not __.is_absent( detected )
        and _mimetypes.is_textual_mimetype( detected.mimetype )
    ): return detected
    # No detection, or detection produced a non-textual MIME type.
    return _MimetypeResult( mimetype = mimetype_default, confidence = 1.0 )
def _parse_http_content_type(
    http_content_type: _nomina.HttpContentTypeArgument
) -> tuple[ __.Absential[ str ], __.Absential[ __.typx.Optional[ str ] ] ]:
    ''' Splits HTTP Content-Type into MIME type and charset parts.

        An absent header yields absence for both parts. Otherwise, parsing
        is delegated to ``_inference.parse_http_content_type``.
    '''
    if not __.is_absent( http_content_type ):
        return _inference.parse_http_content_type( http_content_type )
    return __.absent, __.absent
def _validate_text(
    text: str, confidence: float, /, *,
    behaviors: _BehaviorsArgument,
    profile: _validation.ProfileArgument,
    location: _nomina.LocationArgument,
) -> None:
    ''' Validates text against profile, as behaviors dictate.

        Whether validation runs depends on ``behaviors.text_validate``:

        * ``Always``: validation runs.
        * ``AsNeeded``: validation runs only if ``confidence`` is below
          ``behaviors.text_validate_confidence``.
        * ``Never`` or anything else: validation does not run.

        Raises ``TextInvalidity`` if validation runs and the profile
        rejects the text.
    '''
    invalidity = _exceptions.TextInvalidity( location = location )
    mode = behaviors.text_validate
    if mode == _BehaviorTristate.Always:
        is_needed = True
    elif mode == _BehaviorTristate.AsNeeded:
        is_needed = confidence < behaviors.text_validate_confidence
    else:
        is_needed = False
    if not is_needed: return
    if not profile( text ): raise invalidity
def _validate_text_in_decode_attempt(
    text: str, result: _CharsetResult, /, *,
    behaviors: _BehaviorsArgument,
    profile: _validation.ProfileArgument,
    location: _nomina.LocationArgument,
) -> None:
    ''' Validates text from trial decode, disregarding its confidence.

        Delegates to ``_validate_text`` with a confidence of 0.0 rather
        than the confidence of ``result``, which is accepted but unused.
        (Presumably, ``result`` exists to satisfy the validator signature
        expected by ``_charsets.attempt_decodes``; confirm there.)
    '''
    confidence_floor = 0.0
    return _validate_text(
        text, confidence_floor,
        behaviors = behaviors, profile = profile, location = location )