Coverage for sources / detextive / inference.py: 98%
81 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-14 04:38 +0000
1# vim: set filetype=python fileencoding=utf-8:
2# -*- coding: utf-8 -*-
4#============================================================================#
5# #
6# Licensed under the Apache License, Version 2.0 (the "License"); #
7# you may not use this file except in compliance with the License. #
8# You may obtain a copy of the License at #
9# #
10# http://www.apache.org/licenses/LICENSE-2.0 #
11# #
12# Unless required by applicable law or agreed to in writing, software #
13# distributed under the License is distributed on an "AS IS" BASIS, #
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
15# See the License for the specific language governing permissions and #
16# limitations under the License. #
17# #
18#============================================================================#
21''' Core detection function implementations. '''
24from . import __
25from . import charsets as _charsets
26from . import detectors as _detectors
27from . import exceptions as _exceptions
28from . import mimetypes as _mimetypes
29from . import nomina as _nomina
31from .core import ( # isort: skip
32 BEHAVIORS_DEFAULT as _BEHAVIORS_DEFAULT,
33 CHARSET_DEFAULT as _CHARSET_DEFAULT,
34 MIMETYPE_DEFAULT as _MIMETYPE_DEFAULT,
35 Behaviors as _Behaviors,
36 BehaviorsArgument as _BehaviorsArgument,
37 CharsetResult as _CharsetResult,
38 CodecSpecifiers as _CodecSpecifiers,
39 MimetypeResult as _MimetypeResult,
40)
def infer_charset( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    charset_default: _nomina.CharsetDefaultArgument = _CHARSET_DEFAULT,
    http_content_type: _nomina.HttpContentTypeArgument = __.absent,
    charset_supplement: _nomina.CharsetSupplementArgument = __.absent,
    mimetype_supplement: _nomina.MimetypeSupplementArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
) -> __.typx.Optional[ str ]:
    ''' Infers charset of content and returns only its name.

        Convenience wrapper around :func:`infer_charset_confidence`: all
        arguments are forwarded unchanged and the ``charset`` attribute of
        the produced result is returned, discarding the confidence.

        ``charset_default`` is the returned fallback when inference cannot
        determine another charset. ``charset_supplement`` is a user-supplied
        hint used during inference/validation.
    '''
    return infer_charset_confidence(
        content,
        behaviors = behaviors,
        charset_default = charset_default,
        http_content_type = http_content_type,
        charset_supplement = charset_supplement,
        mimetype_supplement = mimetype_supplement,
        location = location,
    ).charset
def infer_charset_confidence( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    charset_default: _nomina.CharsetDefaultArgument = _CHARSET_DEFAULT,
    http_content_type: _nomina.HttpContentTypeArgument = __.absent,
    charset_supplement: _nomina.CharsetSupplementArgument = __.absent,
    mimetype_supplement: _nomina.MimetypeSupplementArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
) -> _CharsetResult:
    ''' Infers charset with confidence level through various means.

        Resolution order:

        * Empty content is reported as ``utf-8`` with confidence 1.0.
        * A non-empty ``http_content_type`` is parsed, independent of
          detector enablement behavior. If that produces a charset result
          with a concrete (non-``None``) charset, the result is returned.
          A MIME type obtained from the header replaces
          ``mimetype_supplement`` as the hint for the next step.
        * If ``behaviors.charset_detect`` is enabled, the charset detector
          is consulted with the caller's behaviors, default, MIME type
          hint and location.

        ``charset_default`` is the returned fallback when inference cannot
        determine another charset. ``charset_supplement`` is a user-supplied
        hint used during inference/validation.

        Raises ``CharsetInferFailure`` if none of the steps produces a
        result.
    '''
    if content == b'':
        return _CharsetResult( charset = 'utf-8', confidence = 1.0 )
    mimetype = mimetype_supplement
    http_content_type = (
        '' if __.is_absent( http_content_type ) else http_content_type )
    if http_content_type:
        mimetype_result, charset_result = _validate_http_content_type(
            content, behaviors, http_content_type,
            charset_supplement = charset_supplement, location = location )
        if not __.is_absent( mimetype_result ):
            # Header MIME type outranks the supplement as detection hint.
            mimetype = mimetype_result.mimetype
        if (    not __.is_absent( charset_result )
            and charset_result.charset is not None
        ): return charset_result
    result: __.Absential[ _CharsetResult ] = __.absent
    if behaviors.charset_detect:
        # Forward behaviors and location, as the combined MIME type and
        # charset inference does, so caller configuration is honored.
        result = _detectors.detect_charset_confidence(
            content,
            behaviors = behaviors,
            default = charset_default,
            mimetype = mimetype,
            location = location )
    if __.is_absent( result ):
        raise _exceptions.CharsetInferFailure( location = location )
    return result
def infer_mimetype_charset( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    charset_default: _nomina.CharsetDefaultArgument = _CHARSET_DEFAULT,
    mimetype_default: _nomina.MimetypeDefaultArgument = _MIMETYPE_DEFAULT,
    http_content_type: _nomina.HttpContentTypeArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
    charset_supplement: _nomina.CharsetSupplementArgument = __.absent,
    mimetype_supplement: _nomina.MimetypeSupplementArgument = __.absent,
) -> tuple[ str, __.typx.Optional[ str ] ]:
    ''' Infers MIME type and charset, returning only their names.

        Convenience wrapper around
        :func:`infer_mimetype_charset_confidence`: all arguments are
        forwarded unchanged and the confidence values are discarded.

        ``*_default`` values are returned fallbacks on inference failure.
        ``*_supplement`` values are user-supplied hints used to guide
        inference before fallback behavior is applied.
    '''
    inferred_mimetype, inferred_charset = infer_mimetype_charset_confidence(
        content,
        behaviors = behaviors,
        charset_default = charset_default,
        mimetype_default = mimetype_default,
        http_content_type = http_content_type,
        location = location,
        charset_supplement = charset_supplement,
        mimetype_supplement = mimetype_supplement )
    return inferred_mimetype.mimetype, inferred_charset.charset
def infer_mimetype_charset_confidence( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    charset_default: _nomina.CharsetDefaultArgument = _CHARSET_DEFAULT,
    mimetype_default: _nomina.MimetypeDefaultArgument = _MIMETYPE_DEFAULT,
    http_content_type: _nomina.HttpContentTypeArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
    charset_supplement: _nomina.CharsetSupplementArgument = __.absent,
    mimetype_supplement: _nomina.MimetypeSupplementArgument = __.absent,
) -> tuple[ _MimetypeResult, _CharsetResult ]:
    ''' Infers MIME type and charset, each with a confidence level.

        Sources are consulted in this order; a later source is used only
        while the corresponding result is still absent:

        * A non-empty ``http_content_type`` header, which may provide
          both results.
        * ``location``, which may provide the MIME type (confidence 0.9).
        * MIME type detection, if ``behaviors.mimetype_detect`` is
          enabled. The charset already obtained, or else
          ``charset_supplement``, is passed as a hint.
        * Charset detection, if ``behaviors.charset_detect`` is enabled.
          The MIME type already obtained, or else ``mimetype_supplement``,
          is passed as a hint.

        Raises ``CharsetInferFailure`` if no charset result was produced;
        otherwise raises ``MimetypeInferFailure`` if no MIME type result
        was produced.
    '''
    charset_detection_enabled = behaviors.charset_detect
    mimetype_detection_enabled = behaviors.mimetype_detect
    charset_result: __.Absential[ _CharsetResult ] = __.absent
    mimetype_result: __.Absential[ _MimetypeResult ] = __.absent
    header = '' if __.is_absent( http_content_type ) else http_content_type
    if header:
        mimetype_result, charset_result = _validate_http_content_type(
            content, behaviors, header,
            charset_supplement = charset_supplement, location = location )
    if __.is_absent( mimetype_result ) and not __.is_absent( location ):
        located_mimetype = _mimetypes.mimetype_from_location( location )
        if not __.is_absent( located_mimetype ):
            mimetype_result = _MimetypeResult(
                mimetype = located_mimetype, confidence = 0.9 )
    if __.is_absent( mimetype_result ) and mimetype_detection_enabled:
        # Prefer a concrete charset from the header over the supplement.
        if __.is_absent( charset_result ) or charset_result.charset is None:
            charset_hint = charset_supplement
        else:
            charset_hint = charset_result.charset
        mimetype_result = _detectors.detect_mimetype_confidence(
            content,
            behaviors = behaviors,
            default = mimetype_default,
            charset = charset_hint,
            location = location )
    if __.is_absent( charset_result ) and charset_detection_enabled:
        # Prefer an already inferred MIME type over the supplement.
        if __.is_absent( mimetype_result ):
            mimetype_hint = mimetype_supplement
        else:
            mimetype_hint = mimetype_result.mimetype
        charset_result = _detectors.detect_charset_confidence(
            content,
            behaviors = behaviors,
            default = charset_default,
            mimetype = mimetype_hint,
            location = location )
    if __.is_absent( charset_result ):
        raise _exceptions.CharsetInferFailure( location = location )
    if __.is_absent( mimetype_result ):
        raise _exceptions.MimetypeInferFailure( location = location )
    return mimetype_result, charset_result
def parse_http_content_type(
    http_content_type: str
) -> tuple[ __.Absential[ str ], __.Absential[ __.typx.Optional[ str ] ] ]:
    ''' Parses RFC 9110 HTTP Content-Type header.

        Returns normalized MIME type and charset, if able to be extracted.
        Marks either as absent, if not able to be extracted.

        The MIME type is the text before the first ``;``, stripped and
        lowercased; if nothing remains, both values are absent. For a
        non-textual MIME type the charset is ``None``. For a textual MIME
        type, the first ``charset`` parameter decides: its value is
        stripped, unquoted if given as a quoted string, and lowercased;
        an empty value or a missing parameter yields an absent charset.
    '''
    mimetype, *params = http_content_type.split( ';' )
    # Strip before testing so that a whitespace-only media type is absent.
    mimetype = mimetype.strip( ).lower( )
    if not mimetype: return __.absent, __.absent
    if not _mimetypes.is_textual_mimetype( mimetype ):
        return mimetype, None # non-textual type, charset irrelevant
    for param in params:
        name, separator, value = param.partition( '=' )
        if not separator: continue
        if 'charset' != name.strip( ).lower( ): continue
        charset = value.strip( )
        # Parameter values may be quoted strings: charset="utf-8".
        if len( charset ) > 1 and charset[ 0 ] == charset[ -1 ] == '"':
            charset = charset[ 1 : -1 ].strip( )
        charset = charset.lower( )
        if charset: return mimetype, charset
        return mimetype, __.absent
    return mimetype, __.absent
def validate_httpct_charset(
    content: _nomina.Content,
    charset: str, /, *,
    behaviors: _Behaviors = _BEHAVIORS_DEFAULT,
) -> __.Absential[ _CharsetResult ]:
    ''' Checks charset from HTTP Content-Type header against content.

        Delegates to ``trial_decode_as_confident``, passing ``charset``
        as the inference and a copy of ``behaviors`` whose
        ``trial_codecs`` is restricted to ``FromInference``. The result
        of that call is returned as is.
    '''
    return _charsets.trial_decode_as_confident(
        content,
        behaviors = __.dcls.replace(
            behaviors, trial_codecs = ( _CodecSpecifiers.FromInference, ) ),
        inference = charset )
def _validate_http_content_type(
    content: _nomina.Content,
    behaviors: _Behaviors,
    http_content_type: str, /, *,
    charset_supplement: __.Absential[ str ] = __.absent,
    location: __.Absential[ _nomina.Location ] = __.absent,
) -> tuple[ __.Absential[ _MimetypeResult ], __.Absential[ _CharsetResult ] ]:
    ''' Produces MIME type and charset results from HTTP Content-Type.

        The header is parsed with :func:`parse_http_content_type`. A
        parsed MIME type becomes a result with confidence 0.9. A parsed
        charset of ``None`` becomes a result with charset ``None`` and
        confidence 0.9; any other parsed charset is checked against the
        content via :func:`validate_httpct_charset`. Values which could
        not be parsed remain absent.

        ``charset_supplement`` and ``location`` are accepted but not
        used by this function.
    '''
    mimetype, charset = parse_http_content_type( http_content_type )
    if __.is_absent( charset ):
        charset_result = __.absent
    elif charset is not None:
        charset_result = validate_httpct_charset(
            content, charset, behaviors = behaviors )
    else:
        charset_result = _CharsetResult( charset = None, confidence = 0.9 )
    mimetype_result = (
        __.absent if __.is_absent( mimetype )
        else _MimetypeResult( mimetype = mimetype, confidence = 0.9 ) )
    return mimetype_result, charset_result