Coverage for sources/detextive/inference.py: 100%
88 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-20 18:02 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-20 18:02 +0000
1# vim: set filetype=python fileencoding=utf-8:
2# -*- coding: utf-8 -*-
4#============================================================================#
5# #
6# Licensed under the Apache License, Version 2.0 (the "License"); #
7# you may not use this file except in compliance with the License. #
8# You may obtain a copy of the License at #
9# #
10# http://www.apache.org/licenses/LICENSE-2.0 #
11# #
12# Unless required by applicable law or agreed to in writing, software #
13# distributed under the License is distributed on an "AS IS" BASIS, #
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
15# See the License for the specific language governing permissions and #
16# limitations under the License. #
17# #
18#============================================================================#
21''' Core detection function implementations. '''
24from . import __
25from . import charsets as _charsets
26from . import detectors as _detectors
27from . import exceptions as _exceptions
28from . import mimetypes as _mimetypes
29from . import nomina as _nomina
31from .core import ( # isort: skip
32 BEHAVIORS_DEFAULT as _BEHAVIORS_DEFAULT,
33 CHARSET_DEFAULT as _CHARSET_DEFAULT,
34 MIMETYPE_DEFAULT as _MIMETYPE_DEFAULT,
35 BehaviorTristate as _BehaviorTristate,
36 Behaviors as _Behaviors,
37 BehaviorsArgument as _BehaviorsArgument,
38 CharsetResult as _CharsetResult,
39 MimetypeResult as _MimetypeResult,
40)
def infer_charset( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    charset_default: _nomina.CharsetDefaultArgument = _CHARSET_DEFAULT,
    http_content_type: _nomina.HttpContentTypeArgument = __.absent,
    charset_supplement: _nomina.CharsetSupplementArgument = __.absent,
    mimetype_supplement: _nomina.MimetypeSupplementArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
) -> __.typx.Optional[ str ]:
    ''' Infers charset through various means.

        Convenience wrapper which forwards every argument, unchanged, to
        :func:`infer_charset_confidence` and returns only the ``charset``
        attribute of its result, discarding the confidence.
    '''
    return infer_charset_confidence(
        content,
        behaviors = behaviors,
        charset_default = charset_default,
        http_content_type = http_content_type,
        charset_supplement = charset_supplement,
        mimetype_supplement = mimetype_supplement,
        location = location,
    ).charset
def infer_charset_confidence( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    charset_default: _nomina.CharsetDefaultArgument = _CHARSET_DEFAULT,
    http_content_type: _nomina.HttpContentTypeArgument = __.absent,
    charset_supplement: _nomina.CharsetSupplementArgument = __.absent,
    mimetype_supplement: _nomina.MimetypeSupplementArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
) -> _CharsetResult:
    ''' Infers charset with confidence level through various means.

        Empty content is reported as ``utf-8`` with confidence ``1.0``
        without consulting any other source.

        Otherwise, if parsing is enabled by ``behaviors.charset_detect`` and
        an HTTP Content-Type header is supplied, the header is validated
        first. A header-derived result with a non-``None`` charset is
        returned immediately. A header-derived MIME type replaces
        ``mimetype_supplement`` as the hint given to content detection.

        If detection is enabled, the content detector is consulted next.

        Raises ``CharsetInferFailure`` if no result could be produced.
    '''
    if content == b'':
        return _CharsetResult( charset = 'utf-8', confidence = 1.0 )
    should_parse, should_detect = (
        _determine_parse_detect( behaviors.charset_detect ) )
    mimetype = mimetype_supplement
    header = '' if __.is_absent( http_content_type ) else http_content_type
    if should_parse and header:
        mimetype_result, charset_result = _validate_http_content_type(
            content, behaviors, header,
            charset_supplement = charset_supplement, location = location )
        if not __.is_absent( mimetype_result ):
            mimetype = mimetype_result.mimetype
        if (    not __.is_absent( charset_result )
            and charset_result.charset is not None
        ): return charset_result
    result: __.Absential[ _CharsetResult ] = __.absent
    if should_detect:
        # Forward behaviors and location, matching the detector call in
        # infer_mimetype_charset_confidence; otherwise caller-supplied
        # behaviors would be ignored on this path.
        result = _detectors.detect_charset_confidence(
            content,
            behaviors = behaviors,
            default = charset_default,
            mimetype = mimetype,
            location = location )
    if __.is_absent( result ):
        raise _exceptions.CharsetInferFailure( location = location )
    return result
def infer_mimetype_charset( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    charset_default: _nomina.CharsetDefaultArgument = _CHARSET_DEFAULT,
    mimetype_default: _nomina.MimetypeDefaultArgument = _MIMETYPE_DEFAULT,
    http_content_type: _nomina.HttpContentTypeArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
    charset_supplement: _nomina.CharsetSupplementArgument = __.absent,
    mimetype_supplement: _nomina.MimetypeSupplementArgument = __.absent,
) -> tuple[ str, __.typx.Optional[ str ] ]:
    ''' Infers MIME type and charset through various means.

        Convenience wrapper which forwards every argument, unchanged, to
        :func:`infer_mimetype_charset_confidence` and returns the bare
        ``( mimetype, charset )`` pair, discarding the confidences.
    '''
    results = infer_mimetype_charset_confidence(
        content,
        behaviors = behaviors,
        charset_default = charset_default,
        mimetype_default = mimetype_default,
        http_content_type = http_content_type,
        location = location,
        charset_supplement = charset_supplement,
        mimetype_supplement = mimetype_supplement )
    mimetype_result, charset_result = results
    return mimetype_result.mimetype, charset_result.charset
def infer_mimetype_charset_confidence( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    charset_default: _nomina.CharsetDefaultArgument = _CHARSET_DEFAULT,
    mimetype_default: _nomina.MimetypeDefaultArgument = _MIMETYPE_DEFAULT,
    http_content_type: _nomina.HttpContentTypeArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
    charset_supplement: _nomina.CharsetSupplementArgument = __.absent,
    mimetype_supplement: _nomina.MimetypeSupplementArgument = __.absent,
) -> tuple[ _MimetypeResult, _CharsetResult ]:
    ''' Infers MIME type and charset, each with a confidence level.

        Sources are consulted in this order:

        1. The HTTP Content-Type header, if parsing is enabled and a
           header is supplied.
        2. The location, for the MIME type only, if parsing is enabled and
           the header did not yield a MIME type.
        3. Content-based MIME type detection, if enabled and the MIME type
           is still unknown.
        4. Content-based charset detection, if enabled and the charset
           result is still unknown.

        Raises ``CharsetInferFailure`` if no charset result was produced,
        else ``MimetypeInferFailure`` if no MIME type result was produced.
    '''
    # The parse flag from the charset behavior is threaded through the
    # MIME type behavior so that the two are combined into one flag.
    parse_enabled, detect_charset = (
        _determine_parse_detect( behaviors.charset_detect ) )
    parse_enabled, detect_mimetype = (
        _determine_parse_detect(
            behaviors.mimetype_detect, should_parse = parse_enabled ) )
    charset_result: __.Absential[ _CharsetResult ] = __.absent
    mimetype_result: __.Absential[ _MimetypeResult ] = __.absent
    header = '' if __.is_absent( http_content_type ) else http_content_type
    if parse_enabled and header:
        mimetype_result, charset_result = _validate_http_content_type(
            content, behaviors, header,
            charset_supplement = charset_supplement, location = location )
    if (    parse_enabled
        and __.is_absent( mimetype_result )
        and not __.is_absent( location )
    ):
        located = _mimetypes.mimetype_from_location( location )
        if not __.is_absent( located ):
            mimetype_result = _MimetypeResult(
                mimetype = located, confidence = 0.9 )
    if detect_mimetype and __.is_absent( mimetype_result ):
        # Prefer a charset established from the header as the hint;
        # otherwise fall back to the caller's supplement.
        if __.is_absent( charset_result ) or charset_result.charset is None:
            charset_hint = charset_supplement
        else: charset_hint = charset_result.charset
        mimetype_result = _detectors.detect_mimetype_confidence(
            content,
            behaviors = behaviors,
            default = mimetype_default,
            charset = charset_hint,
            location = location )
    if detect_charset and __.is_absent( charset_result ):
        if __.is_absent( mimetype_result ):
            mimetype_hint = mimetype_supplement
        else: mimetype_hint = mimetype_result.mimetype
        charset_result = _detectors.detect_charset_confidence(
            content,
            behaviors = behaviors,
            default = charset_default,
            mimetype = mimetype_hint,
            location = location )
    if __.is_absent( charset_result ):
        raise _exceptions.CharsetInferFailure( location = location )
    if __.is_absent( mimetype_result ):
        raise _exceptions.MimetypeInferFailure( location = location )
    return mimetype_result, charset_result
def parse_http_content_type(
    http_content_type: str
) -> tuple[ __.Absential[ str ], __.Absential[ __.typx.Optional[ str ] ] ]:
    ''' Parses RFC 9110 HTTP Content-Type header.

        Returns normalized MIME type and charset, if able to be extracted.
        Marks either as absent, if not able to be extracted.

        The MIME type is stripped and lowercased. For a non-textual MIME
        type, the charset is ``None`` rather than absent. For a textual
        MIME type, the first ``charset`` parameter with a non-empty value
        is returned, lowercased and with surrounding double quotes
        removed.

        Empty parameters (such as from a trailing ``;``) and parameters
        without ``=`` are skipped rather than raising ``ValueError``.
    '''
    mimetype, *params = http_content_type.split( ';' )
    # Normalize before the emptiness check so that a whitespace-only media
    # type is reported as absent rather than as an empty string.
    mimetype = mimetype.strip( ).lower( )
    if not mimetype: return __.absent, __.absent
    if not _mimetypes.is_textual_mimetype( mimetype ):
        return mimetype, None # non-textual type, charset irrelevant
    for param in params:
        # Partition tolerates a missing '=' and any '=' within the value.
        name, separator, value = param.partition( '=' )
        if not separator: continue
        if 'charset' != name.strip( ).lower( ): continue
        # Parameter values may be quoted strings: charset="utf-8".
        charset = value.strip( ).strip( '"' ).strip( ).lower( )
        if charset: return mimetype, charset
    return mimetype, __.absent
def _determine_parse_detect(
    detect_tristate: _BehaviorTristate, should_parse = False
) -> tuple[ bool, bool ]:
    ''' Maps detection tristate onto parse and detect flags.

        Returns ``( should_parse, should_detect )``. A truthy incoming
        ``should_parse`` is always preserved. Otherwise, parsing is
        disabled only for ``Always``. Detection is disabled only for
        ``Never``.
    '''
    if detect_tristate == _BehaviorTristate.Always:
        parse, detect = should_parse or False, True
    elif detect_tristate == _BehaviorTristate.AsNeeded:
        parse, detect = should_parse or True, True
    elif detect_tristate == _BehaviorTristate.Never: # pragma: no branch
        parse, detect = should_parse or True, False
    return parse, detect
def _validate_http_content_type(
    content: _nomina.Content,
    behaviors: _Behaviors,
    http_content_type: str, /, *,
    charset_supplement: __.Absential[ str ] = __.absent,
    location: __.Absential[ _nomina.Location ] = __.absent,
) -> tuple[ __.Absential[ _MimetypeResult ], __.Absential[ _CharsetResult ] ]:
    ''' Parses Content-Type header and checks its charset against content.

        Returns ``( mimetype_result, charset_result )``, either of which
        may be absent if the header did not provide it. A parsed MIME type
        is reported with confidence ``0.9``. A parsed charset of ``None``
        is reported as a ``None`` charset with confidence ``0.9``. Any
        other parsed charset is handed to
        ``_charsets.trial_decode_as_confident`` together with the content.

        The ``location`` argument is accepted but not used here.
    '''
    parsed_mimetype, parsed_charset = (
        parse_http_content_type( http_content_type ) )
    charset_result: __.Absential[ _CharsetResult ]
    if parsed_charset is None:
        charset_result = _CharsetResult( charset = None, confidence = 0.9 )
    elif __.is_absent( parsed_charset ):
        charset_result = __.absent
    else:
        charset_result = _charsets.trial_decode_as_confident(
            content,
            behaviors = behaviors,
            inference = parsed_charset,
            supplement = charset_supplement )
    mimetype_result: __.Absential[ _MimetypeResult ] = (
        __.absent if __.is_absent( parsed_mimetype )
        else _MimetypeResult( mimetype = parsed_mimetype, confidence = 0.9 ) )
    return mimetype_result, charset_result