Coverage for sources/detextive/inference.py: 100%

1# vim: set filetype=python fileencoding=utf-8:

2# -*- coding: utf-8 -*-

4#============================================================================#

5# #

6# Licensed under the Apache License, Version 2.0 (the "License"); #

7# you may not use this file except in compliance with the License. #

8# You may obtain a copy of the License at #

9# #

10# http://www.apache.org/licenses/LICENSE-2.0 #

11# #

12# Unless required by applicable law or agreed to in writing, software #

13# distributed under the License is distributed on an "AS IS" BASIS, #

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #

15# See the License for the specific language governing permissions and #

16# limitations under the License. #

17# #

18#============================================================================#

21''' Core detection function implementations. '''

24from . import __

25from . import charsets as _charsets

26from . import detectors as _detectors

27from . import exceptions as _exceptions

28from . import mimetypes as _mimetypes

29from . import nomina as _nomina

31from .core import ( # isort: skip

32 BEHAVIORS_DEFAULT as _BEHAVIORS_DEFAULT,

33 CHARSET_DEFAULT as _CHARSET_DEFAULT,

34 MIMETYPE_DEFAULT as _MIMETYPE_DEFAULT,

35 BehaviorTristate as _BehaviorTristate,

36 Behaviors as _Behaviors,

37 BehaviorsArgument as _BehaviorsArgument,

38 CharsetResult as _CharsetResult,

39 MimetypeResult as _MimetypeResult,

40)

def infer_charset( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    charset_default: _nomina.CharsetDefaultArgument = _CHARSET_DEFAULT,
    http_content_type: _nomina.HttpContentTypeArgument = __.absent,
    charset_supplement: _nomina.CharsetSupplementArgument = __.absent,
    mimetype_supplement: _nomina.MimetypeSupplementArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
) -> __.typx.Optional[ str ]:
    ''' Infers charset of content, without the confidence level.

    Forwards every argument unchanged to ``infer_charset_confidence`` and
    returns only the ``charset`` attribute of the result.
    '''
    # Collected once so that the delegation reads as a single pass-through.
    options = dict(
        behaviors = behaviors,
        charset_default = charset_default,
        http_content_type = http_content_type,
        charset_supplement = charset_supplement,
        mimetype_supplement = mimetype_supplement,
        location = location )
    return infer_charset_confidence( content, **options ).charset

def infer_charset_confidence( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    charset_default: _nomina.CharsetDefaultArgument = _CHARSET_DEFAULT,
    http_content_type: _nomina.HttpContentTypeArgument = __.absent,
    charset_supplement: _nomina.CharsetSupplementArgument = __.absent,
    mimetype_supplement: _nomina.MimetypeSupplementArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
) -> _CharsetResult:
    ''' Infers charset with confidence level through various means.

    Empty content is reported as UTF-8 with full confidence. Otherwise,
    as governed by ``behaviors.charset_detect``, the charset is taken
    from the HTTP Content-Type header, if one is supplied and it yields
    a charset, or else is detected from the content.

    Raises ``CharsetInferFailure`` if neither means produces a result.
    '''
    if content == b'':
        return _CharsetResult( charset = 'utf-8', confidence = 1.0 )
    should_parse, should_detect = (
        _determine_parse_detect( behaviors.charset_detect ) )
    mimetype = mimetype_supplement
    header = '' if __.is_absent( http_content_type ) else http_content_type
    if should_parse and header:
        mimetype_result, charset_result = _validate_http_content_type(
            content, behaviors, header,
            charset_supplement = charset_supplement, location = location )
        if not __.is_absent( mimetype_result ):
            # MIME type from header supersedes the supplement as the hint
            # given to charset detection below.
            mimetype = mimetype_result.mimetype
        if ( not __.is_absent( charset_result )
             and charset_result.charset is not None
        ): return charset_result
    result: __.Absential[ _CharsetResult ] = __.absent
    if should_detect:
        # Behaviors and location are forwarded, as they are in
        # ``infer_mimetype_charset_confidence``, so that detection honors
        # the configuration and context supplied by the caller.
        result = _detectors.detect_charset_confidence(
            content,
            behaviors = behaviors,
            default = charset_default,
            mimetype = mimetype,
            location = location )
    if __.is_absent( result ):
        raise _exceptions.CharsetInferFailure( location = location )
    return result

def infer_mimetype_charset( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    charset_default: _nomina.CharsetDefaultArgument = _CHARSET_DEFAULT,
    mimetype_default: _nomina.MimetypeDefaultArgument = _MIMETYPE_DEFAULT,
    http_content_type: _nomina.HttpContentTypeArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
    charset_supplement: _nomina.CharsetSupplementArgument = __.absent,
    mimetype_supplement: _nomina.MimetypeSupplementArgument = __.absent,
) -> tuple[ str, __.typx.Optional[ str ] ]:
    ''' Infers MIME type and charset, without the confidence levels.

    Forwards every argument unchanged to
    ``infer_mimetype_charset_confidence`` and returns only the
    ``mimetype`` and ``charset`` attributes of the respective results.
    '''
    # Collected once so that the delegation reads as a single pass-through.
    options = dict(
        behaviors = behaviors,
        charset_default = charset_default,
        mimetype_default = mimetype_default,
        http_content_type = http_content_type,
        location = location,
        charset_supplement = charset_supplement,
        mimetype_supplement = mimetype_supplement )
    confident_mimetype, confident_charset = (
        infer_mimetype_charset_confidence( content, **options ) )
    return confident_mimetype.mimetype, confident_charset.charset

121

122

def infer_mimetype_charset_confidence( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    charset_default: _nomina.CharsetDefaultArgument = _CHARSET_DEFAULT,
    mimetype_default: _nomina.MimetypeDefaultArgument = _MIMETYPE_DEFAULT,
    http_content_type: _nomina.HttpContentTypeArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
    charset_supplement: _nomina.CharsetSupplementArgument = __.absent,
    mimetype_supplement: _nomina.MimetypeSupplementArgument = __.absent,
) -> tuple[ _MimetypeResult, _CharsetResult ]:
    ''' Infers MIME type and charset, each with a confidence level.

    Sources are consulted in order, each only while its result is still
    absent: the HTTP Content-Type header, then the location (MIME type
    only), then detection from the content. Parsing and detection are
    enabled per ``behaviors.charset_detect`` and
    ``behaviors.mimetype_detect``.

    Raises ``CharsetInferFailure`` if no charset result is produced and,
    checked afterwards, ``MimetypeInferFailure`` if no MIME type result
    is produced.
    '''
    may_parse, may_detect_charset = (
        _determine_parse_detect( behaviors.charset_detect ) )
    # Parsing is enabled if either the charset or MIME type behavior wants it.
    may_parse, may_detect_mimetype = _determine_parse_detect(
        behaviors.mimetype_detect, should_parse = may_parse )
    charset_found: __.Absential[ _CharsetResult ] = __.absent
    mimetype_found: __.Absential[ _MimetypeResult ] = __.absent
    header = '' if __.is_absent( http_content_type ) else http_content_type
    if may_parse and header:
        mimetype_found, charset_found = _validate_http_content_type(
            content, behaviors, header,
            charset_supplement = charset_supplement, location = location )
    if (    may_parse
        and __.is_absent( mimetype_found )
        and not __.is_absent( location )
    ):
        # Header gave no MIME type; fall back to the location.
        located = _mimetypes.mimetype_from_location( location )
        if not __.is_absent( located ):
            mimetype_found = _MimetypeResult(
                mimetype = located, confidence = 0.9 )
    if may_detect_mimetype and __.is_absent( mimetype_found ):
        # Prefer a charset already established from the header as the hint.
        charset_hint = charset_supplement
        if (    not __.is_absent( charset_found )
            and charset_found.charset is not None
        ): charset_hint = charset_found.charset
        mimetype_found = _detectors.detect_mimetype_confidence(
            content,
            behaviors = behaviors,
            default = mimetype_default,
            charset = charset_hint,
            location = location )
    if may_detect_charset and __.is_absent( charset_found ):
        if __.is_absent( mimetype_found ):
            mimetype_hint = mimetype_supplement
        else: mimetype_hint = mimetype_found.mimetype
        charset_found = _detectors.detect_charset_confidence(
            content,
            behaviors = behaviors,
            default = charset_default,
            mimetype = mimetype_hint,
            location = location )
    if __.is_absent( charset_found ):
        raise _exceptions.CharsetInferFailure( location = location )
    if __.is_absent( mimetype_found ):
        raise _exceptions.MimetypeInferFailure( location = location )
    return mimetype_found, charset_found

179

180

def parse_http_content_type(
    http_content_type: str
) -> tuple[ __.Absential[ str ], __.Absential[ __.typx.Optional[ str ] ] ]:
    ''' Parses RFC 9110 HTTP Content-Type header.

    Returns normalized MIME type and charset, if able to be extracted.
    Marks either as absent, if not able to be extracted.

    The charset is ``None`` for a non-textual MIME type. Malformed
    parameters (no ``=``, or an empty ``charset`` value) are skipped
    rather than raising. Double quotes around the charset value are
    removed.
    '''
    mimetype, *params = http_content_type.split( ';' )
    # Normalize before testing so a whitespace-only type counts as missing.
    mimetype = mimetype.strip( ).lower( )
    if not mimetype: return __.absent, __.absent
    if not _mimetypes.is_textual_mimetype( mimetype ):
        return mimetype, None # non-textual type, charset irrelevant
    for param in params:
        # ``partition`` tolerates a parameter without '=' (as produced by
        # a trailing ';') and a value which itself contains '='.
        name, _, value = param.partition( '=' )
        if 'charset' != name.strip( ).lower( ): continue
        # Parameter values may be quoted strings per RFC 9110.
        charset = value.strip( ).strip( '"' ).strip( ).lower( )
        if charset: return mimetype, charset
    return mimetype, __.absent

200

201

202def _determine_parse_detect(

203 detect_tristate: _BehaviorTristate, should_parse = False

204) -> tuple[ bool, bool ]:

205 match detect_tristate:

206 case _BehaviorTristate.Always:

207 should_parse = should_parse or False

208 should_detect = True

209 case _BehaviorTristate.AsNeeded:

210 should_parse = should_parse or True

211 should_detect = True

212 case _BehaviorTristate.Never: # pragma: no branch

213 should_parse = should_parse or True

214 should_detect = False

215 return should_parse, should_detect

216

217

218def _validate_http_content_type(

219 content: _nomina.Content,

220 behaviors: _Behaviors,

221 http_content_type: str, /, *,

222 charset_supplement: __.Absential[ str ] = __.absent,

223 location: __.Absential[ _nomina.Location ] = __.absent,

224) -> tuple[ __.Absential[ _MimetypeResult ], __.Absential[ _CharsetResult ] ]:

225 mimetype, charset = parse_http_content_type( http_content_type )

226 if __.is_absent( charset ):

227 charset_result = __.absent

228 elif charset is None:

229 charset_result = _CharsetResult( charset = None, confidence = 0.9 )

230 else:

231 charset_result = _charsets.trial_decode_as_confident(

232 content,

233 behaviors = behaviors,

234 inference = charset,

235 supplement = charset_supplement )

236 if __.is_absent( mimetype ): mimetype_result = __.absent

237 else:

238 mimetype_result = _MimetypeResult(

239 mimetype = mimetype, confidence = 0.9 )

240 return mimetype_result, charset_result