Coverage for sources/detextive/inference.py: 100%

88 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-20 18:02 +0000

1# vim: set filetype=python fileencoding=utf-8: 

2# -*- coding: utf-8 -*- 

3 

4#============================================================================# 

5# # 

6# Licensed under the Apache License, Version 2.0 (the "License"); # 

7# you may not use this file except in compliance with the License. # 

8# You may obtain a copy of the License at # 

9# # 

10# http://www.apache.org/licenses/LICENSE-2.0 # 

11# # 

12# Unless required by applicable law or agreed to in writing, software # 

13# distributed under the License is distributed on an "AS IS" BASIS, # 

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 

15# See the License for the specific language governing permissions and # 

16# limitations under the License. # 

17# # 

18#============================================================================# 

19 

20 

21''' Core detection function implementations. ''' 

22 

23 

24from . import __ 

25from . import charsets as _charsets 

26from . import detectors as _detectors 

27from . import exceptions as _exceptions 

28from . import mimetypes as _mimetypes 

29from . import nomina as _nomina 

30 

31from .core import ( # isort: skip 

32 BEHAVIORS_DEFAULT as _BEHAVIORS_DEFAULT, 

33 CHARSET_DEFAULT as _CHARSET_DEFAULT, 

34 MIMETYPE_DEFAULT as _MIMETYPE_DEFAULT, 

35 BehaviorTristate as _BehaviorTristate, 

36 Behaviors as _Behaviors, 

37 BehaviorsArgument as _BehaviorsArgument, 

38 CharsetResult as _CharsetResult, 

39 MimetypeResult as _MimetypeResult, 

40) 

41 

42 

def infer_charset( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    charset_default: _nomina.CharsetDefaultArgument = _CHARSET_DEFAULT,
    http_content_type: _nomina.HttpContentTypeArgument = __.absent,
    charset_supplement: _nomina.CharsetSupplementArgument = __.absent,
    mimetype_supplement: _nomina.MimetypeSupplementArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
) -> __.typx.Optional[ str ]:
    ''' Infers charset through various means.

        Convenience form of :func:`infer_charset_confidence`: forwards every
        argument unchanged and returns only the ``charset`` attribute of the
        result, discarding the confidence.
    '''
    return infer_charset_confidence(
        content,
        behaviors = behaviors,
        charset_default = charset_default,
        http_content_type = http_content_type,
        charset_supplement = charset_supplement,
        mimetype_supplement = mimetype_supplement,
        location = location,
    ).charset

62 

63 

def infer_charset_confidence( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    charset_default: _nomina.CharsetDefaultArgument = _CHARSET_DEFAULT,
    http_content_type: _nomina.HttpContentTypeArgument = __.absent,
    charset_supplement: _nomina.CharsetSupplementArgument = __.absent,
    mimetype_supplement: _nomina.MimetypeSupplementArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
) -> _CharsetResult:
    ''' Infers charset with confidence level through various means.

        Empty content is reported as UTF-8 with full confidence.

        Otherwise, ``behaviors.charset_detect`` decides whether the HTTP
        Content-Type header is consulted and whether detection from content
        is attempted. A charset validated from the header is returned
        immediately. A MIME type from the header replaces
        ``mimetype_supplement`` as the hint for content detection.

        Raises ``CharsetInferFailure`` if no result could be produced.
    '''
    if content == b'':
        return _CharsetResult( charset = 'utf-8', confidence = 1.0 )
    should_parse, should_detect = (
        _determine_parse_detect( behaviors.charset_detect ) )
    mimetype = mimetype_supplement
    http_content_type = (
        '' if __.is_absent( http_content_type ) else http_content_type )
    if should_parse and http_content_type:
        mimetype_result, charset_result = _validate_http_content_type(
            content, behaviors, http_content_type,
            charset_supplement = charset_supplement, location = location )
        if not __.is_absent( mimetype_result ):
            mimetype = mimetype_result.mimetype
        # A result with charset None (non-textual MIME type) carries no
        # usable charset, so fall through to detection in that case.
        if (    not __.is_absent( charset_result )
            and charset_result.charset is not None
        ): return charset_result
    result: __.Absential[ _CharsetResult ] = __.absent
    if should_detect:
        # Forward behaviors and location, as the combined MIME type and
        # charset inference does, so caller configuration is honored.
        result = _detectors.detect_charset_confidence(
            content,
            behaviors = behaviors,
            default = charset_default,
            mimetype = mimetype,
            location = location )
    if __.is_absent( result ):
        raise _exceptions.CharsetInferFailure( location = location )
    return result

97 

98 

def infer_mimetype_charset( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    charset_default: _nomina.CharsetDefaultArgument = _CHARSET_DEFAULT,
    mimetype_default: _nomina.MimetypeDefaultArgument = _MIMETYPE_DEFAULT,
    http_content_type: _nomina.HttpContentTypeArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
    charset_supplement: _nomina.CharsetSupplementArgument = __.absent,
    mimetype_supplement: _nomina.MimetypeSupplementArgument = __.absent,
) -> tuple[ str, __.typx.Optional[ str ] ]:
    ''' Infers MIME type and charset through various means.

        Convenience form of :func:`infer_mimetype_charset_confidence`:
        forwards every argument unchanged and returns only the ``mimetype``
        and ``charset`` attributes of the two results, in that order.
    '''
    mimetype_info, charset_info = infer_mimetype_charset_confidence(
        content,
        behaviors = behaviors,
        charset_default = charset_default,
        mimetype_default = mimetype_default,
        http_content_type = http_content_type,
        location = location,
        charset_supplement = charset_supplement,
        mimetype_supplement = mimetype_supplement )
    return mimetype_info.mimetype, charset_info.charset

121 

122 

def infer_mimetype_charset_confidence( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    charset_default: _nomina.CharsetDefaultArgument = _CHARSET_DEFAULT,
    mimetype_default: _nomina.MimetypeDefaultArgument = _MIMETYPE_DEFAULT,
    http_content_type: _nomina.HttpContentTypeArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
    charset_supplement: _nomina.CharsetSupplementArgument = __.absent,
    mimetype_supplement: _nomina.MimetypeSupplementArgument = __.absent,
) -> tuple[ _MimetypeResult, _CharsetResult ]:
    ''' Infers MIME type and charset, each with a confidence level.

        Sources are tried in this order:

        1. The HTTP Content-Type header, when parsing is enabled and the
           header is non-empty.
        2. The location, for the MIME type only, when parsing is enabled
           and the header gave no MIME type.
        3. Detection from content, separately for MIME type and charset,
           when the corresponding behavior allows it. Each detection is
           given the other inferred value, or the caller's supplement, as
           a hint.

        Raises ``CharsetInferFailure`` if no charset result was produced.
        Raises ``MimetypeInferFailure`` if no MIME type result was produced.
    '''
    parse_for_charset, detect_charset = (
        _determine_parse_detect( behaviors.charset_detect ) )
    # Parsing stays enabled if the charset behavior already asked for it.
    should_parse, detect_mimetype = _determine_parse_detect(
        behaviors.mimetype_detect, should_parse = parse_for_charset )
    charset_result: __.Absential[ _CharsetResult ] = __.absent
    mimetype_result: __.Absential[ _MimetypeResult ] = __.absent
    if __.is_absent( http_content_type ): http_content_type = ''
    if should_parse and http_content_type:
        mimetype_result, charset_result = _validate_http_content_type(
            content, behaviors, http_content_type,
            charset_supplement = charset_supplement, location = location )
    if (    should_parse
        and __.is_absent( mimetype_result )
        and not __.is_absent( location )
    ):
        mimetype_from_location = (
            _mimetypes.mimetype_from_location( location ) )
        if not __.is_absent( mimetype_from_location ):
            mimetype_result = _MimetypeResult(
                mimetype = mimetype_from_location, confidence = 0.9 )
    if detect_mimetype and __.is_absent( mimetype_result ):
        # Prefer a charset taken from the header over the supplement.
        charset_hint = charset_supplement
        if (    not __.is_absent( charset_result )
            and charset_result.charset is not None
        ): charset_hint = charset_result.charset
        mimetype_result = _detectors.detect_mimetype_confidence(
            content,
            behaviors = behaviors,
            default = mimetype_default,
            charset = charset_hint,
            location = location )
    if detect_charset and __.is_absent( charset_result ):
        if __.is_absent( mimetype_result ):
            mimetype_hint = mimetype_supplement
        else: mimetype_hint = mimetype_result.mimetype
        charset_result = _detectors.detect_charset_confidence(
            content,
            behaviors = behaviors,
            default = charset_default,
            mimetype = mimetype_hint,
            location = location )
    if __.is_absent( charset_result ):
        raise _exceptions.CharsetInferFailure( location = location )
    if __.is_absent( mimetype_result ):
        raise _exceptions.MimetypeInferFailure( location = location )
    return mimetype_result, charset_result

179 

180 

def parse_http_content_type(
    http_content_type: str
) -> tuple[ __.Absential[ str ], __.Absential[ __.typx.Optional[ str ] ] ]:
    ''' Parses RFC 9110 HTTP Content-Type header.

        Returns normalized MIME type and charset, if able to be extracted.
        Marks either as absent, if not able to be extracted.
        For a non-textual MIME type, the charset is ``None``.

        The MIME type and charset are stripped and lowercased. A charset
        value given as a quoted string has its surrounding double quotes
        removed. Parameters without ``=`` (such as the empty segment left
        by a trailing ``;``) and empty charset values are ignored rather
        than causing an error.
    '''
    mimetype, *params = http_content_type.split( ';' )
    # Strip before testing so a whitespace-only media type counts as absent.
    mimetype = mimetype.strip( ).lower( )
    if not mimetype: return __.absent, __.absent
    if not _mimetypes.is_textual_mimetype( mimetype ):
        return mimetype, None # non-textual type, charset irrelevant
    for param in params:
        # partition never raises, unlike unpacking the result of split,
        # whether the parameter has no '=' or several.
        name, separator, value = param.partition( '=' )
        if not separator: continue
        if 'charset' != name.strip( ).lower( ): continue
        charset = value.strip( ).strip( '"' ).strip( ).lower( )
        if charset: return mimetype, charset
    return mimetype, __.absent

200 

201 

202def _determine_parse_detect( 

203 detect_tristate: _BehaviorTristate, should_parse = False 

204) -> tuple[ bool, bool ]: 

205 match detect_tristate: 

206 case _BehaviorTristate.Always: 

207 should_parse = should_parse or False 

208 should_detect = True 

209 case _BehaviorTristate.AsNeeded: 

210 should_parse = should_parse or True 

211 should_detect = True 

212 case _BehaviorTristate.Never: # pragma: no branch 

213 should_parse = should_parse or True 

214 should_detect = False 

215 return should_parse, should_detect 

216 

217 

def _validate_http_content_type(
    content: _nomina.Content,
    behaviors: _Behaviors,
    http_content_type: str, /, *,
    charset_supplement: __.Absential[ str ] = __.absent,
    location: __.Absential[ _nomina.Location ] = __.absent,
) -> tuple[ __.Absential[ _MimetypeResult ], __.Absential[ _CharsetResult ] ]:
    ''' Parses HTTP Content-Type header into MIME type and charset results.

        A MIME type extracted from the header is reported with confidence
        0.9. For the charset:

        * absent from the header: the charset result is absent;
        * ``None`` (non-textual MIME type): a result with charset ``None``
          and confidence 0.9;
        * otherwise: whatever ``_charsets.trial_decode_as_confident``
          returns for the content, with the header charset as the
          inference and ``charset_supplement`` as the supplement.

        ``location`` is accepted but not used by this function.
    '''
    mimetype, charset = parse_http_content_type( http_content_type )
    charset_result: __.Absential[ _CharsetResult ] = __.absent
    if not __.is_absent( charset ):
        if charset is None:
            charset_result = _CharsetResult(
                charset = None, confidence = 0.9 )
        else:
            charset_result = _charsets.trial_decode_as_confident(
                content,
                behaviors = behaviors,
                inference = charset,
                supplement = charset_supplement )
    mimetype_result: __.Absential[ _MimetypeResult ] = (
        __.absent if __.is_absent( mimetype )
        else _MimetypeResult( mimetype = mimetype, confidence = 0.9 ) )
    return mimetype_result, charset_result