Coverage for sources / detextive / detectors.py: 100%

126 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-17 06:15 +0000

1# vim: set filetype=python fileencoding=utf-8: 

2# -*- coding: utf-8 -*- 

3 

4#============================================================================# 

5# # 

6# Licensed under the Apache License, Version 2.0 (the "License"); # 

7# you may not use this file except in compliance with the License. # 

8# You may obtain a copy of the License at # 

9# # 

10# http://www.apache.org/licenses/LICENSE-2.0 # 

11# # 

12# Unless required by applicable law or agreed to in writing, software # 

13# distributed under the License is distributed on an "AS IS" BASIS, # 

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 

15# See the License for the specific language governing permissions and # 

16# limitations under the License. # 

17# # 

18#============================================================================# 

19 

20 

21''' Core detection function implementations. ''' 

22 

23 

24from . import __ 

25from . import charsets as _charsets 

26from . import core as _core 

27from . import exceptions as _exceptions 

28from . import mimetypes as _mimetypes 

29from . import nomina as _nomina 

30from . import validation as _validation 

31 

32from .core import ( # isort: skip 

33 BEHAVIORS_DEFAULT as _BEHAVIORS_DEFAULT, 

34 CHARSET_DEFAULT as _CHARSET_DEFAULT, 

35 MIMETYPE_DEFAULT as _MIMETYPE_DEFAULT, 

36 BehaviorTristate as _BehaviorTristate, 

37 Behaviors as _Behaviors, 

38 BehaviorsArgument as _BehaviorsArgument, 

39 CharsetResult as _CharsetResult, 

40 CodecSpecifiers as _CodecSpecifiers, 

41 DetectFailureActions as _DetectFailureActions, 

42 MimetypeResult as _MimetypeResult, 

43) 

44 

45 

# Call signature shared by every entry of the ``charset_detectors`` registry.
CharsetDetector: __.typx.TypeAlias = __.typx.Annotated[
    __.cabc.Callable[
        [ _nomina.Content, _Behaviors ],
        _CharsetResult | __.types.NotImplementedType,
    ],
    __.ddoc.Doc(
        ''' Character set detector function.

            Takes bytes content and behaviors object.

            Returns either a detection result or ``NotImplemented``. The
            detection result will include the name of the character set, which
            has been determined as able to decode the content, or ``None``, if
            it believes that no character set is applicable to the content, and
            the confidence of the detection.
        ''' ),
]

# Call signature shared by every entry of the ``mimetype_detectors`` registry.
MimetypeDetector: __.typx.TypeAlias = __.typx.Annotated[
    __.cabc.Callable[
        [ _nomina.Content, _Behaviors ],
        _MimetypeResult | __.types.NotImplementedType,
    ],
    __.ddoc.Doc(
        ''' MIME type detector function.

            Takes bytes content and behaviors object.

            Returns either a detection result or ``NotImplemented``. The
            detection result will include the MIME type and the confidence of
            the detection.
        ''' ),
]

78 

79 

# Entries are looked up by the names listed in
# ``behaviors.charset_detectors_order``; names without an entry are skipped.
charset_detectors: __.typx.Annotated[
    __.accret.Dictionary[ str, CharsetDetector ],
    __.ddoc.Doc( ''' Registry for character set detectors. ''' ),
] = __.accret.Dictionary( )

# Entries are looked up by the names listed in
# ``behaviors.mimetype_detectors_order``; names without an entry are skipped.
mimetype_detectors: __.typx.Annotated[
    __.accret.Dictionary[ str, MimetypeDetector ],
    __.ddoc.Doc( ''' Registry for MIME type detectors. ''' ),
] = __.accret.Dictionary( )

88 

89 

def detect_charset( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    default: _nomina.CharsetDefaultArgument = _CHARSET_DEFAULT,
    supplement: _nomina.CharsetSupplementArgument = __.absent,
    mimetype: _nomina.MimetypeAssumptionArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
) -> __.typx.Optional[ str ]:
    ''' Detects character set.

        Forwards all arguments to :py:func:`detect_charset_confidence` and
        returns only the character set of its result, which may be ``None``.
    '''
    return detect_charset_confidence(
        content,
        behaviors = behaviors, default = default,
        supplement = supplement, mimetype = mimetype,
        location = location ).charset

107 

108 

def detect_charset_confidence( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    default: _nomina.CharsetDefaultArgument = _CHARSET_DEFAULT,
    supplement: _nomina.CharsetSupplementArgument = __.absent,
    mimetype: _nomina.MimetypeAssumptionArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
) -> _CharsetResult:
    ''' Detects character set with confidence score.

        Empty content yields ``default`` with confidence 1.0. Otherwise,
        registered detectors are consulted in the order given by
        ``behaviors.charset_detectors_order``; the first one which does not
        return ``NotImplemented`` provides the detection.

        If no detector provides a detection, then
        ``behaviors.charset_on_detect_failure`` selects between returning
        ``default`` with confidence 0.0 and raising ``CharsetDetectFailure``.

        A detection without a character set is returned as is, unless a
        textual MIME type is supplied, in which case trial decodes are
        attempted. A detection with a character set is confirmed before it
        is returned.
    '''
    if b'' == content:
        return _CharsetResult( charset = default, confidence = 1.0 )
    detection = NotImplemented
    for name in behaviors.charset_detectors_order:
        detector = charset_detectors.get( name )
        if detector is None: continue
        detection = detector( content, behaviors )
        if detection is not NotImplemented: break
    if detection is NotImplemented:
        # No registered detector could handle the content.
        if behaviors.charset_on_detect_failure == _DetectFailureActions.Default:
            return _CharsetResult( charset = default, confidence = 0.0 )
        raise _exceptions.CharsetDetectFailure( location = location )
    if detection.charset is not None:
        return _confirm_charset_detection(
            content, behaviors, detection,
            supplement = supplement, location = location )
    # Detector claims that no character set applies. Only second-guess it
    # when the caller asserts that the content has a textual MIME type.
    if __.is_absent( mimetype ): return detection
    if not _mimetypes.is_textual_mimetype( mimetype ): return detection
    trial = _charsets.trial_decode_as_confident(
        content,
        behaviors = behaviors,
        supplement = supplement,
        location = location )
    return _normalize_charset_detection( content, behaviors, trial )

144 

145 

def detect_mimetype(
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    default: _nomina.MimetypeDefaultArgument = _MIMETYPE_DEFAULT,
    charset: _nomina.CharsetAssumptionArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
) -> str:
    ''' Detects most probable MIME type.

        Forwards all arguments to :py:func:`detect_mimetype_confidence` and
        returns only the MIME type of its result.
    '''
    detection = detect_mimetype_confidence(
        content,
        behaviors = behaviors,
        default = default,
        charset = charset,
        location = location )
    return detection.mimetype

161 

162 

163def detect_mimetype_confidence( 

164 content: _nomina.Content, /, *, 

165 behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT, 

166 default: _nomina.MimetypeDefaultArgument = _MIMETYPE_DEFAULT, 

167 charset: _nomina.CharsetAssumptionArgument = __.absent, 

168 location: _nomina.LocationArgument = __.absent, 

169) -> _MimetypeResult: 

170 ''' Detects MIME type candidates with confidence scores. ''' 

171 if b'' == content: 

172 return _MimetypeResult( mimetype = 'text/plain', confidence = 1.0 ) 

173 result: _MimetypeResult | __.types.NotImplementedType = NotImplemented 

174 for name in behaviors.mimetype_detectors_order: 

175 detector = mimetype_detectors.get( name ) 

176 if detector is None: continue 

177 result = detector( content, behaviors ) 

178 if result is not NotImplemented: break 

179 try_charset = ( 

180 result is NotImplemented or ( 

181 not _mimetypes.is_textual_mimetype( result.mimetype ) 

182 and result.confidence < behaviors.trial_decode_confidence ) ) 

183 if try_charset and not __.is_absent( charset ): 

184 # For charset validation, only try specified charset (no OS default) 

185 behaviors_charset_only = __.dcls.replace( 

186 behaviors, trial_codecs = ( _CodecSpecifiers.FromInference, ) ) 

187 result_from_charset = _detect_mimetype_from_charset( 

188 content, behaviors_charset_only, charset, 

189 default = default, location = location ) 

190 if result_from_charset.mimetype == 'text/plain': 

191 return result_from_charset 

192 if result is not NotImplemented: return result 

193 match behaviors.mimetype_on_detect_failure: 

194 case _DetectFailureActions.Default: 

195 return _MimetypeResult( mimetype = default, confidence = 0.0 ) 

196 case _: 

197 raise _exceptions.MimetypeDetectFailure( location = location ) 

198 

199 

def _confirm_charset_detection( # noqa: PLR0911
    content: _nomina.Content,
    behaviors: _Behaviors,
    result: _CharsetResult, /, *,
    supplement: __.Absential[ str ] = __.absent,
    location: __.Absential[ _nomina.Location ] = __.absent,
) -> _CharsetResult:
    ''' Confirms detected character set against content.

        The detection is normalized first. A detection whose charset name
        starts with ``utf-`` is confirmed via a confident trial decode.
        Any other detection is returned unchanged if trial decodes are
        disabled, if it matches the OS default charset, or if the content
        cannot be decoded by the UTF-8 trial. Otherwise, the charset from
        the UTF-8 trial replaces it and the original confidence is kept.
    '''
    normalized = _normalize_charset_detection( content, behaviors, result )
    if normalized.charset is None: return normalized # pragma: no cover
    charset = normalized.charset
    confidence = normalized.confidence
    # Both trial paths below exclude the OS default from the codecs to try.
    codecs_restricted = (
        _CodecSpecifiers.UserSupplement, _CodecSpecifiers.FromInference )
    if charset.startswith( 'utf-' ):
        confirmed = _charsets.trial_decode_as_confident(
            content,
            behaviors = __.dcls.replace(
                behaviors, trial_codecs = codecs_restricted ),
            supplement = supplement,
            inference = charset,
            confidence = confidence,
            location = location )
        return _normalize_charset_detection( content, behaviors, confirmed )
    if behaviors.trial_decode == _BehaviorTristate.Never: return normalized
    # Shake out false positives, like 'MacRoman'.
    if charset == _charsets.discover_os_charset_default( ):
        # Allow 'windows-1252', etc..., as appropriate.
        return normalized # pragma: no cover
    # Try UTF-8 to shake out false positives, but not OS default.
    try:
        _, alternative = _charsets.attempt_decodes(
            content,
            behaviors = __.dcls.replace(
                behaviors, trial_codecs = codecs_restricted ),
            inference = 'utf-8-sig',
            supplement = supplement,
            location = location )
    except _exceptions.ContentDecodeFailure: return normalized
    if charset == alternative.charset: return normalized # pragma: no cover
    return _normalize_charset_detection(
        content, behaviors,
        _CharsetResult(
            charset = alternative.charset, confidence = confidence ) )

248 

249 

def _detect_mimetype_from_charset(
    content: _nomina.Content,
    behaviors: _Behaviors,
    charset: str, /, *,
    default: str,
    location: __.Absential[ _nomina.Location ],
) -> _MimetypeResult:
    ''' Infers ``text/plain`` from successful decode with charset.

        Reports ``text/plain``, with the confidence of the decode, only if
        trial decodes and text validation are both enabled, the content
        decodes, and the decoded text passes the textual profile.

        In every other case, ``behaviors.mimetype_on_detect_failure``
        selects between returning ``default`` with confidence 0.0 and
        raising ``MimetypeDetectFailure``.
    '''
    should_error = not (
        behaviors.mimetype_on_detect_failure == _DetectFailureActions.Default )
    error = _exceptions.MimetypeDetectFailure( location = location )
    fallback = _MimetypeResult( mimetype = default, confidence = 0.0 )
    if behaviors.trial_decode == _BehaviorTristate.Never:
        if should_error: raise error
        return fallback
    try:
        text, decode_result = _charsets.attempt_decodes(
            content,
            behaviors = behaviors, inference = charset, location = location )
    except _exceptions.ContentDecodeFailure:
        # Suppress the decode failure as the cause; callers only see the
        # detection failure.
        if should_error: raise error from None
        return fallback
    # Without validation, decodable content cannot be vouched for as text.
    unvalidated = behaviors.text_validate == _BehaviorTristate.Never
    if unvalidated or not _validation.PROFILE_TEXTUAL( text ):
        if should_error: raise error
        return fallback
    return _MimetypeResult(
        mimetype = 'text/plain', confidence = decode_result.confidence )

285 

286 

def _detect_via_chardet(
    content: _nomina.Content, behaviors: _Behaviors
) -> _CharsetResult | __.types.NotImplementedType:
    ''' Detects character set via ``chardet``.

        Returns ``NotImplemented`` if ``chardet`` cannot be imported.
        The ``behaviors`` argument is accepted to conform to the detector
        signature and is not consulted.
    '''
    try: import chardet # pragma: no cover
    except ImportError: return NotImplemented # pragma: no cover
    detection = chardet.detect( content )
    return _CharsetResult(
        charset = detection[ 'encoding' ],
        confidence = detection[ 'confidence' ] )

charset_detectors[ 'chardet' ] = _detect_via_chardet

297 

298 

def _detect_via_charset_normalizer(
    content: _nomina.Content, behaviors: _Behaviors
) -> _CharsetResult | __.types.NotImplementedType:
    ''' Detects character set via ``charset_normalizer``.

        Returns ``NotImplemented`` if ``charset_normalizer`` cannot be
        imported. The charset is ``None`` when the library has no best
        match. Confidence is derived from the quantity of content bytes.
    '''
    try: import charset_normalizer # pragma: no cover
    except ImportError: return NotImplemented # pragma: no cover
    best = charset_normalizer.from_bytes( content ).best( )
    charset = best.encoding if best is not None else None # pragma: no cover
    return _CharsetResult(
        charset = charset,
        confidence = _core.confidence_from_bytes_quantity(
            content, behaviors = behaviors ) )

charset_detectors[ 'charset-normalizer' ] = _detect_via_charset_normalizer

311 

312 

def _detect_via_magic(
    content: _nomina.Content, behaviors: _Behaviors
) -> _MimetypeResult | __.types.NotImplementedType:
    ''' Detects MIME type via ``magic``.

        Returns ``NotImplemented`` if ``magic`` cannot be imported or if
        it raises any exception while inspecting the content. Confidence
        is derived from the quantity of content bytes.
    '''
    try: import magic # pragma: no cover
    except ImportError: return NotImplemented # pragma: no cover
    # Best effort: any failure inside the library means "cannot detect".
    try: mimetype = magic.from_buffer( content, mime = True )
    except Exception: return NotImplemented # pragma: no cover
    return _MimetypeResult(
        mimetype = mimetype,
        confidence = _core.confidence_from_bytes_quantity(
            content, behaviors = behaviors ) )

mimetype_detectors[ 'magic' ] = _detect_via_magic

325 

326 

def _detect_via_puremagic(
    content: _nomina.Content, behaviors: _Behaviors
) -> _MimetypeResult | __.types.NotImplementedType:
    ''' Detects MIME type via ``puremagic``.

        Returns ``NotImplemented`` if ``puremagic`` cannot be imported, if
        it raises ``PureError`` or ``ValueError`` for the content, or if it
        reports an empty MIME type. Confidence is derived from the quantity
        of content bytes.
    '''
    try: import puremagic # pragma: no cover
    except ImportError: return NotImplemented # pragma: no cover
    try: mimetype = puremagic.from_string( content, mime = True )
    except ( puremagic.PureError, ValueError ): # pragma: no cover
        return NotImplemented
    # A matched signature may carry no MIME type; an empty string is not a
    # usable detection, so let the next detector or failure action decide.
    if not mimetype: return NotImplemented
    confidence = _core.confidence_from_bytes_quantity(
        content, behaviors = behaviors )
    return _MimetypeResult( mimetype = mimetype, confidence = confidence )

mimetype_detectors[ 'puremagic' ] = _detect_via_puremagic

340 

341 

def _normalize_charset_detection(
    content: _nomina.Content, _behaviors: _Behaviors, result: _CharsetResult
) -> _CharsetResult:
    ''' Normalizes character set name of detection for content.

        A detection without a character set is returned unchanged.
        Otherwise, a new result with the normalized name and the same
        confidence is returned. The behaviors argument is not consulted.
    '''
    detected = result.charset
    if detected is None: return result # pragma: no cover
    return _CharsetResult(
        charset = _charsets.normalize_charset_for_content( content, detected ),
        confidence = result.confidence )