Coverage for sources / detextive / inference.py: 100%

81 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-17 06:15 +0000

1# vim: set filetype=python fileencoding=utf-8: 

2# -*- coding: utf-8 -*- 

3 

4#============================================================================# 

5# # 

6# Licensed under the Apache License, Version 2.0 (the "License"); # 

7# you may not use this file except in compliance with the License. # 

8# You may obtain a copy of the License at # 

9# # 

10# http://www.apache.org/licenses/LICENSE-2.0 # 

11# # 

12# Unless required by applicable law or agreed to in writing, software # 

13# distributed under the License is distributed on an "AS IS" BASIS, # 

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 

15# See the License for the specific language governing permissions and # 

16# limitations under the License. # 

17# # 

18#============================================================================# 

19 

20 

21''' Core detection function implementations. ''' 

22 

23 

24from . import __ 

25from . import charsets as _charsets 

26from . import detectors as _detectors 

27from . import exceptions as _exceptions 

28from . import mimetypes as _mimetypes 

29from . import nomina as _nomina 

30 

31from .core import ( # isort: skip 

32 BEHAVIORS_DEFAULT as _BEHAVIORS_DEFAULT, 

33 CHARSET_DEFAULT as _CHARSET_DEFAULT, 

34 MIMETYPE_DEFAULT as _MIMETYPE_DEFAULT, 

35 Behaviors as _Behaviors, 

36 BehaviorsArgument as _BehaviorsArgument, 

37 CharsetResult as _CharsetResult, 

38 CodecSpecifiers as _CodecSpecifiers, 

39 MimetypeResult as _MimetypeResult, 

40) 

41 

42 

def infer_charset( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    charset_default: _nomina.CharsetDefaultArgument = _CHARSET_DEFAULT,
    http_content_type: _nomina.HttpContentTypeArgument = __.absent,
    charset_supplement: _nomina.CharsetSupplementArgument = __.absent,
    mimetype_supplement: _nomina.MimetypeSupplementArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
) -> __.typx.Optional[ str ]:
    ''' Infers charset name for content, without confidence.

        Thin convenience wrapper: forwards every argument unchanged to
        :func:`infer_charset_confidence` and returns only the ``charset``
        attribute of its result.

        ``charset_default`` is the fallback handed to the detector.
        ``charset_supplement`` and ``mimetype_supplement`` are user-supplied
        hints. Any exception raised by the delegated call propagates.
    '''
    return infer_charset_confidence(
        content,
        behaviors = behaviors,
        charset_default = charset_default,
        http_content_type = http_content_type,
        charset_supplement = charset_supplement,
        mimetype_supplement = mimetype_supplement,
        location = location,
    ).charset

67 

68 

def infer_charset_confidence( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    charset_default: _nomina.CharsetDefaultArgument = _CHARSET_DEFAULT,
    http_content_type: _nomina.HttpContentTypeArgument = __.absent,
    charset_supplement: _nomina.CharsetSupplementArgument = __.absent,
    mimetype_supplement: _nomina.MimetypeSupplementArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
) -> _CharsetResult:
    ''' Infers charset with confidence level through various means.

        Resolution order:

        1. Empty content (``b''``) is reported as ``utf-8`` with full
           confidence.
        2. If ``http_content_type`` is supplied and non-empty, it is parsed
           and validated, independent of detector enablement behavior. A
           validated result with a concrete charset is returned immediately;
           a MIME type found in the header replaces ``mimetype_supplement``
           as the hint for the next step.
        3. If ``behaviors.charset_detect`` is enabled, the charset detector
           is consulted with ``charset_default`` as its fallback.

        ``charset_supplement`` is a user-supplied hint which is forwarded to
        the HTTP Content-Type validation step.

        Raises :class:`CharsetInferFailure` if no step produced a result.
    '''
    if content == b'':
        return _CharsetResult( charset = 'utf-8', confidence = 1.0 )
    should_detect = behaviors.charset_detect
    mimetype = mimetype_supplement
    if not __.is_absent( http_content_type ) and http_content_type:
        mimetype_result, charset_result = _validate_http_content_type(
            content, behaviors, http_content_type,
            charset_supplement = charset_supplement, location = location )
        if not __.is_absent( mimetype_result ):
            mimetype = mimetype_result.mimetype
        # A result whose charset is None marks a non-textual MIME type;
        # that is not an answer here, so fall through to detection.
        if (    not __.is_absent( charset_result )
            and charset_result.charset is not None
        ): return charset_result
    result: __.Absential[ _CharsetResult ] = __.absent
    if should_detect:
        # Forward behaviors and location, as the sibling
        # infer_mimetype_charset_confidence does, so that caller-supplied
        # behaviors are honored by the detector on this path too.
        result = _detectors.detect_charset_confidence(
            content,
            behaviors = behaviors,
            default = charset_default,
            mimetype = mimetype,
            location = location )
    if __.is_absent( result ):
        raise _exceptions.CharsetInferFailure( location = location )
    return result

107 

108 

def infer_mimetype_charset( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    charset_default: _nomina.CharsetDefaultArgument = _CHARSET_DEFAULT,
    mimetype_default: _nomina.MimetypeDefaultArgument = _MIMETYPE_DEFAULT,
    http_content_type: _nomina.HttpContentTypeArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
    charset_supplement: _nomina.CharsetSupplementArgument = __.absent,
    mimetype_supplement: _nomina.MimetypeSupplementArgument = __.absent,
) -> tuple[ str, __.typx.Optional[ str ] ]:
    ''' Infers MIME type and charset names, without confidences.

        Thin convenience wrapper: forwards every argument unchanged to
        :func:`infer_mimetype_charset_confidence` and reduces the pair of
        results to ``( mimetype, charset )``.

        ``*_default`` values are fallbacks handed to the detectors.
        ``*_supplement`` values are user-supplied hints used to guide
        inference. Any exception raised by the delegated call propagates.
    '''
    results = infer_mimetype_charset_confidence(
        content,
        behaviors = behaviors,
        charset_default = charset_default,
        mimetype_default = mimetype_default,
        http_content_type = http_content_type,
        location = location,
        charset_supplement = charset_supplement,
        mimetype_supplement = mimetype_supplement )
    mimetype_result, charset_result = results
    return mimetype_result.mimetype, charset_result.charset

136 

137 

def infer_mimetype_charset_confidence( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    charset_default: _nomina.CharsetDefaultArgument = _CHARSET_DEFAULT,
    mimetype_default: _nomina.MimetypeDefaultArgument = _MIMETYPE_DEFAULT,
    http_content_type: _nomina.HttpContentTypeArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
    charset_supplement: _nomina.CharsetSupplementArgument = __.absent,
    mimetype_supplement: _nomina.MimetypeSupplementArgument = __.absent,
) -> tuple[ _MimetypeResult, _CharsetResult ]:
    ''' Infers MIME type and charset, each with a confidence level.

        Sources are consulted in order; a later source is used only for a
        result which is still absent:

        1. A supplied, non-empty ``http_content_type`` is parsed and
           validated, which may yield either or both results.
        2. The MIME type is looked up from ``location``, if supplied, and
           recorded with confidence 0.9.
        3. If ``behaviors.mimetype_detect`` is enabled, the MIME type
           detector runs, hinted with the charset found so far or else
           ``charset_supplement``.
        4. If ``behaviors.charset_detect`` is enabled, the charset detector
           runs, hinted with the MIME type found so far or else
           ``mimetype_supplement``.

        A charset result whose ``charset`` is ``None`` is not absent: it
        suppresses step 4 and is returned as is.

        Raises :class:`CharsetInferFailure` if no charset result was
        produced; otherwise raises :class:`MimetypeInferFailure` if no
        MIME type result was produced.
    '''
    detect_charset = behaviors.charset_detect
    detect_mimetype = behaviors.mimetype_detect
    mimetype_result: __.Absential[ _MimetypeResult ] = __.absent
    charset_result: __.Absential[ _CharsetResult ] = __.absent
    # Step 1: HTTP Content-Type header, when one was actually supplied.
    if not __.is_absent( http_content_type ) and http_content_type:
        mimetype_result, charset_result = _validate_http_content_type(
            content, behaviors, http_content_type,
            charset_supplement = charset_supplement, location = location )
    # Step 2: MIME type from location.
    if __.is_absent( mimetype_result ) and not __.is_absent( location ):
        located = _mimetypes.mimetype_from_location( location )
        if not __.is_absent( located ):
            mimetype_result = _MimetypeResult(
                mimetype = located, confidence = 0.9 )
    # Step 3: MIME type detection from content.
    if detect_mimetype and __.is_absent( mimetype_result ):
        if __.is_absent( charset_result ) or charset_result.charset is None:
            charset_hint = charset_supplement
        else: charset_hint = charset_result.charset
        mimetype_result = _detectors.detect_mimetype_confidence(
            content,
            behaviors = behaviors,
            default = mimetype_default,
            charset = charset_hint,
            location = location )
    # Step 4: charset detection from content.
    if detect_charset and __.is_absent( charset_result ):
        if __.is_absent( mimetype_result ):
            mimetype_hint = mimetype_supplement
        else: mimetype_hint = mimetype_result.mimetype
        charset_result = _detectors.detect_charset_confidence(
            content,
            behaviors = behaviors,
            default = charset_default,
            mimetype = mimetype_hint,
            location = location )
    # Charset failure takes precedence when both results are missing.
    if __.is_absent( charset_result ):
        raise _exceptions.CharsetInferFailure( location = location )
    if __.is_absent( mimetype_result ):
        raise _exceptions.MimetypeInferFailure( location = location )
    return mimetype_result, charset_result

190 

191 

def parse_http_content_type(
    http_content_type: str
) -> tuple[ __.Absential[ str ], __.Absential[ __.typx.Optional[ str ] ] ]:
    ''' Parses RFC 9110 HTTP Content-Type header.

        Returns normalized (stripped, lowercased) MIME type and charset,
        if able to be extracted. Marks either as absent, if not able to be
        extracted:

        * Empty or whitespace-only media type: both are absent.
        * Non-textual media type: charset is ``None`` (irrelevant).
        * Textual media type: charset is the value of the first
          ``charset`` parameter, or absent if there is no such parameter
          or its value is empty.

        A charset value given as a quoted string (``charset="utf-8"``) is
        unquoted, since RFC 9110 treats it as equivalent to the bare token.
        Parameters are split on ``;`` without regard to quoting.
    '''
    mimetype, *params = http_content_type.split( ';' )
    # Normalize before testing for emptiness so that a whitespace-only
    # media type is reported as absent rather than as an empty string.
    mimetype = mimetype.strip( ).lower( )
    if not mimetype: return __.absent, __.absent
    if not _mimetypes.is_textual_mimetype( mimetype ):
        return mimetype, None # non-textual type, charset irrelevant
    for param in params:
        name, separator, value = param.partition( '=' )
        if not separator: continue
        if 'charset' != name.strip( ).lower( ): continue
        charset = value.strip( )
        # Unwrap a balanced quoted-string form of the parameter value.
        if len( charset ) >= 2 and charset[ 0 ] == charset[ -1 ] == '"':
            charset = charset[ 1 : -1 ].strip( )
        charset = charset.lower( )
        # The first charset parameter decides, even if its value is empty.
        if charset: return mimetype, charset
        return mimetype, __.absent
    return mimetype, __.absent

214 

215 

def validate_httpct_charset(
    content: _nomina.Content,
    charset: str, /, *,
    behaviors: _Behaviors = _BEHAVIORS_DEFAULT,
) -> __.Absential[ _CharsetResult ]:
    ''' Validates charset from HTTP Content-Type against content.

        Delegates to ``trial_decode_as_confident`` with ``charset`` as the
        inference and with a copy of ``behaviors`` whose ``trial_codecs``
        is restricted to the from-inference codec specifier. The delegate's
        result is returned unchanged.
    '''
    trial_codecs = ( _CodecSpecifiers.FromInference, )
    return _charsets.trial_decode_as_confident(
        content,
        behaviors = __.dcls.replace( behaviors, trial_codecs = trial_codecs ),
        inference = charset )

225 

226 

def _validate_http_content_type(
    content: _nomina.Content,
    behaviors: _Behaviors,
    http_content_type: str, /, *,
    charset_supplement: __.Absential[ str ] = __.absent,
    location: __.Absential[ _nomina.Location ] = __.absent,
) -> tuple[ __.Absential[ _MimetypeResult ], __.Absential[ _CharsetResult ] ]:
    ''' Parses HTTP Content-Type header into MIME type and charset results.

        The charset result is absent when the header yields no charset,
        carries ``charset = None`` at confidence 0.9 when the parser reports
        the charset as irrelevant, and is otherwise whatever
        :func:`validate_httpct_charset` returns for the declared charset.

        The MIME type result is absent when the header yields no MIME type
        and otherwise carries the parsed MIME type at confidence 0.9.

        ``charset_supplement`` and ``location`` are accepted but are not
        consulted by this function.
    '''
    mimetype, charset = parse_http_content_type( http_content_type )
    charset_result: __.Absential[ _CharsetResult ] = __.absent
    if charset is None:
        charset_result = _CharsetResult( charset = None, confidence = 0.9 )
    elif not __.is_absent( charset ):
        charset_result = validate_httpct_charset(
            content, charset, behaviors = behaviors )
    mimetype_result: __.Absential[ _MimetypeResult ] = (
        __.absent if __.is_absent( mimetype )
        else _MimetypeResult( mimetype = mimetype, confidence = 0.9 ) )
    return mimetype_result, charset_result