Coverage for sources/detextive/detectors.py: 100%

122 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-20 18:02 +0000

1# vim: set filetype=python fileencoding=utf-8: 

2# -*- coding: utf-8 -*- 

3 

4#============================================================================# 

5# # 

6# Licensed under the Apache License, Version 2.0 (the "License"); # 

7# you may not use this file except in compliance with the License. # 

8# You may obtain a copy of the License at # 

9# # 

10# http://www.apache.org/licenses/LICENSE-2.0 # 

11# # 

12# Unless required by applicable law or agreed to in writing, software # 

13# distributed under the License is distributed on an "AS IS" BASIS, # 

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 

15# See the License for the specific language governing permissions and # 

16# limitations under the License. # 

17# # 

18#============================================================================# 

19 

20 

21''' Core detection function implementations. ''' 

22 

23 

24from . import __ 

25from . import charsets as _charsets 

26from . import core as _core 

27from . import exceptions as _exceptions 

28from . import mimetypes as _mimetypes 

29from . import nomina as _nomina 

30from . import validation as _validation 

31 

32from .core import ( # isort: skip 

33 BEHAVIORS_DEFAULT as _BEHAVIORS_DEFAULT, 

34 CHARSET_DEFAULT as _CHARSET_DEFAULT, 

35 MIMETYPE_DEFAULT as _MIMETYPE_DEFAULT, 

36 BehaviorTristate as _BehaviorTristate, 

37 Behaviors as _Behaviors, 

38 BehaviorsArgument as _BehaviorsArgument, 

39 CharsetResult as _CharsetResult, 

40 DetectFailureActions as _DetectFailureActions, 

41 MimetypeResult as _MimetypeResult, 

42) 

43 

44 

# Call signature shared by every entry of the character set detector registry.
CharsetDetector: __.typx.TypeAlias = __.typx.Annotated[
    __.cabc.Callable[
        [ _nomina.Content, _Behaviors ],
        _CharsetResult | __.types.NotImplementedType,
    ],
    __.ddoc.Doc(
        ''' Character set detector function.

            Takes bytes content and behaviors object.

            Returns either a detection result or ``NotImplemented``. The
            detection result will include the name of the character set, which
            has been determined as able to decode the content, or ``None``, if
            it believes that no character set is applicable to the content, and
            the confidence of the detection.
        ''' ),
]

# Call signature shared by every entry of the MIME type detector registry.
MimetypeDetector: __.typx.TypeAlias = __.typx.Annotated[
    __.cabc.Callable[
        [ _nomina.Content, _Behaviors ],
        _MimetypeResult | __.types.NotImplementedType,
    ],
    __.ddoc.Doc(
        ''' MIME type detector function.

            Takes bytes content and behaviors object.

            Returns either a detection result or ``NotImplemented``. The
            detection result will include the MIME type and the confidence of
            the detection.
        ''' ),
]

77 

78 

# Maps detector name to detector function; the names are the ones looked up
# from ``behaviors.charset_detectors_order`` during detection.
charset_detectors: __.typx.Annotated[
    __.accret.Dictionary[ str, CharsetDetector ],
    __.ddoc.Doc( ''' Registry for character set detectors. ''' ),
] = __.accret.Dictionary( )

# Maps detector name to detector function; the names are the ones looked up
# from ``behaviors.mimetype_detectors_order`` during detection.
mimetype_detectors: __.typx.Annotated[
    __.accret.Dictionary[ str, MimetypeDetector ],
    __.ddoc.Doc( ''' Registry for MIME type detectors. ''' ),
] = __.accret.Dictionary( )

87 

88 

def detect_charset( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    default: _nomina.CharsetDefaultArgument = _CHARSET_DEFAULT,
    supplement: _nomina.CharsetSupplementArgument = __.absent,
    mimetype: _nomina.MimetypeAssumptionArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
) -> __.typx.Optional[ str ]:
    ''' Detects character set of content.

        Forwards every argument, unchanged, to
        :func:`detect_charset_confidence` and returns only the ``charset``
        attribute of its result, discarding the confidence.
    '''
    forwarded = dict(
        behaviors = behaviors,
        default = default,
        supplement = supplement,
        mimetype = mimetype,
        location = location )
    return detect_charset_confidence( content, **forwarded ).charset

106 

107 

def detect_charset_confidence( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    default: _nomina.CharsetDefaultArgument = _CHARSET_DEFAULT,
    supplement: _nomina.CharsetSupplementArgument = __.absent,
    mimetype: _nomina.MimetypeAssumptionArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
) -> _CharsetResult:
    ''' Detects character set of content, with confidence score.

        Empty content is reported as ``utf-8`` with confidence 1.0.

        Otherwise, the registered detectors named in
        ``behaviors.charset_detectors_order`` are tried in order; names with
        no registry entry and detectors which return ``NotImplemented`` are
        skipped. The first real result wins.

        If no detector produces a result, then either the ``default`` is
        returned with confidence 0.0 or ``CharsetDetectFailure`` is raised,
        according to ``behaviors.charset_on_detect_failure``.
    '''
    if b'' == content:
        return _CharsetResult( charset = 'utf-8', confidence = 1.0 )
    # ``NotImplemented`` doubles as the "nothing detected yet" sentinel.
    detection = NotImplemented
    for name in behaviors.charset_detectors_order:
        detector = charset_detectors.get( name )
        if detector is None: continue
        detection = detector( content, behaviors )
        if detection is not NotImplemented: break
    if detection is NotImplemented:
        on_failure = behaviors.charset_on_detect_failure
        if on_failure == _DetectFailureActions.Default:
            return _CharsetResult( charset = default, confidence = 0.0 )
        raise _exceptions.CharsetDetectFailure( location = location )
    if detection.charset is not None:
        return _confirm_charset_detection(
            content, behaviors, detection,
            supplement = supplement, location = location )
    # Detector found no applicable charset. Only second-guess it when the
    # caller asserts a textual MIME type for the content.
    if (    __.is_absent( mimetype )
        or  not _mimetypes.is_textual_mimetype( mimetype )
    ): return detection
    trial = _charsets.trial_decode_as_confident(
        content,
        behaviors = behaviors,
        supplement = supplement,
        location = location )
    return _normalize_charset_detection( content, behaviors, trial )

143 

144 

def detect_mimetype(
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    default: _nomina.MimetypeDefaultArgument = _MIMETYPE_DEFAULT,
    charset: _nomina.CharsetAssumptionArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
) -> str:
    ''' Detects MIME type of content.

        Forwards every argument, unchanged, to
        :func:`detect_mimetype_confidence` and returns only the ``mimetype``
        attribute of its result, discarding the confidence.
    '''
    detection = detect_mimetype_confidence(
        content,
        behaviors = behaviors,
        default = default,
        charset = charset,
        location = location )
    return detection.mimetype

160 

161 

def detect_mimetype_confidence(
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    default: _nomina.MimetypeDefaultArgument = _MIMETYPE_DEFAULT,
    charset: _nomina.CharsetAssumptionArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
) -> _MimetypeResult:
    ''' Detects MIME type of content, with confidence score.

        Empty content is reported as ``text/plain`` with confidence 1.0.

        Otherwise, the registered detectors named in
        ``behaviors.mimetype_detectors_order`` are tried in order; names with
        no registry entry and detectors which return ``NotImplemented`` are
        skipped. The first real result is returned as is.

        If no detector produces a result and a ``charset`` was supplied,
        then detection is attempted from that charset. Without a charset,
        either the ``default`` is returned with confidence 0.0 or
        ``MimetypeDetectFailure`` is raised, according to
        ``behaviors.mimetype_on_detect_failure``.
    '''
    if b'' == content:
        return _MimetypeResult( mimetype = 'text/plain', confidence = 1.0 )
    for name in behaviors.mimetype_detectors_order:
        detector = mimetype_detectors.get( name )
        if detector is None: continue
        detection = detector( content, behaviors )
        if detection is not NotImplemented: return detection
    if not __.is_absent( charset ):
        return _detect_mimetype_from_charset(
            content, behaviors, charset,
            default = default, location = location )
    on_failure = behaviors.mimetype_on_detect_failure
    if on_failure == _DetectFailureActions.Default:
        return _MimetypeResult( mimetype = default, confidence = 0.0 )
    raise _exceptions.MimetypeDetectFailure( location = location )

186 

187 

def _confirm_charset_detection( # noqa: PLR0911
    content: _nomina.Content,
    behaviors: _Behaviors,
    result: _CharsetResult, /, *,
    supplement: __.Absential[ str ] = __.absent,
    location: __.Absential[ _nomina.Location ] = __.absent,
) -> _CharsetResult:
    ''' Cross-checks a detector result which names a charset.

        The charset is normalized and then mapped through
        ``behaviors.charset_promotions``. Charsets whose name begins with
        ``utf-`` are handed to a trial decode. Any other charset is returned
        as is when trial decodes are disabled or when it equals the OS
        default charset; otherwise a decode attempt with ``utf-8-sig`` as the
        inference is made and, if that succeeds with a different charset,
        the charset from that attempt is returned instead.
    '''
    normalized = _normalize_charset_detection( content, behaviors, result )
    if normalized.charset is None: return normalized # pragma: no cover
    confidence = normalized.confidence
    charset = behaviors.charset_promotions.get(
        normalized.charset, normalized.charset )
    if charset.startswith( 'utf-' ):
        trial = _charsets.trial_decode_as_confident(
            content,
            behaviors = behaviors,
            supplement = supplement,
            inference = charset,
            confidence = confidence,
            location = location )
        return _normalize_charset_detection( content, behaviors, trial )
    promoted = _CharsetResult( charset = charset, confidence = confidence )
    if behaviors.trial_decode == _BehaviorTristate.Never: return promoted
    # Shake out false positives, like 'MacRoman'.
    if charset == _charsets.discover_os_charset_default( ):
        # Allow 'windows-1252', etc..., as appropriate.
        return promoted # pragma: no cover
    try:
        _, decoded = _charsets.attempt_decodes(
            content,
            behaviors = behaviors,
            inference = 'utf-8-sig',
            supplement = supplement,
            location = location )
    except _exceptions.ContentDecodeFailure: return promoted
    if charset == decoded.charset: return promoted # pragma: no cover
    return _normalize_charset_detection( content, behaviors, decoded )

225 

226 

def _detect_mimetype_from_charset(
    content: _nomina.Content,
    behaviors: _Behaviors,
    charset: str, /, *,
    default: str,
    location: __.Absential[ _nomina.Location ],
) -> _MimetypeResult:
    ''' Infers ``text/plain`` from content which decodes as textual.

        Content is decoded with ``charset`` as the inference and the decoded
        text is checked against the textual validation profile. On success,
        ``text/plain`` is returned with the confidence of the decode.

        Every other outcome is a detection failure: trial decodes disabled,
        decode failure, text validation disabled, or text which fails
        validation. A failure either returns ``default`` with confidence 0.0
        or raises ``MimetypeDetectFailure``, according to
        ``behaviors.mimetype_on_detect_failure``.
    '''
    should_error = not (
        behaviors.mimetype_on_detect_failure == _DetectFailureActions.Default )
    # Both failure outcomes are prepared up front and shared by all exits.
    error = _exceptions.MimetypeDetectFailure( location = location )
    fallback = _MimetypeResult( mimetype = default, confidence = 0.0 )
    if behaviors.trial_decode == _BehaviorTristate.Never:
        if should_error: raise error
        return fallback
    try:
        text, decode_result = _charsets.attempt_decodes(
            content,
            behaviors = behaviors, inference = charset, location = location )
    except _exceptions.ContentDecodeFailure:
        # Decode failure is not reported as the cause of detection failure.
        if should_error: raise error from None
        return fallback
    # Without validation, decoded text cannot be vouched for as textual.
    unvalidated = behaviors.text_validate == _BehaviorTristate.Never
    if unvalidated or not _validation.PROFILE_TEXTUAL( text ):
        if should_error: raise error
        return fallback
    return _MimetypeResult(
        mimetype = 'text/plain', confidence = decode_result.confidence )

262 

263 

def _detect_via_chardet(
    content: _nomina.Content, behaviors: _Behaviors
) -> _CharsetResult | __.types.NotImplementedType:
    ''' Detects character set with ``chardet``, if it can be imported.

        Returns ``NotImplemented`` when the import fails. The ``behaviors``
        argument is accepted for signature compatibility but is not used.
    '''
    try: import chardet # pragma: no cover
    except ImportError: return NotImplemented # pragma: no cover
    report = chardet.detect( content )
    return _CharsetResult(
        charset = report[ 'encoding' ], confidence = report[ 'confidence' ] )

charset_detectors[ 'chardet' ] = _detect_via_chardet

274 

275 

def _detect_via_charset_normalizer(
    content: _nomina.Content, behaviors: _Behaviors
) -> _CharsetResult | __.types.NotImplementedType:
    ''' Detects character set with ``charset_normalizer``, if importable.

        Returns ``NotImplemented`` when the import fails. The charset is
        taken from the best match, or is ``None`` when there is no best
        match. Confidence is derived from the quantity of content bytes
        rather than taken from the library.
    '''
    try: import charset_normalizer # pragma: no cover
    except ImportError: return NotImplemented # pragma: no cover
    best = charset_normalizer.from_bytes( content ).best( )
    charset = best.encoding if best is not None else None # pragma: no cover
    return _CharsetResult(
        charset = charset,
        confidence = _core.confidence_from_bytes_quantity(
            content, behaviors = behaviors ) )

charset_detectors[ 'charset-normalizer' ] = _detect_via_charset_normalizer

288 

289 

def _detect_via_magic(
    content: _nomina.Content, behaviors: _Behaviors
) -> _MimetypeResult | __.types.NotImplementedType:
    ''' Detects MIME type with ``magic``, if it can be imported.

        Returns ``NotImplemented`` when the import fails or when the library
        raises any exception for the content. Confidence is derived from the
        quantity of content bytes.
    '''
    try: import magic # pragma: no cover
    except ImportError: return NotImplemented # pragma: no cover
    try:
        detected = magic.from_buffer( content, mime = True )
    # Deliberately broad: any library failure means "cannot say".
    except Exception: return NotImplemented # pragma: no cover
    return _MimetypeResult(
        mimetype = detected,
        confidence = _core.confidence_from_bytes_quantity(
            content, behaviors = behaviors ) )

mimetype_detectors[ 'magic' ] = _detect_via_magic

302 

303 

def _detect_via_puremagic(
    content: _nomina.Content, behaviors: _Behaviors
) -> _MimetypeResult | __.types.NotImplementedType:
    ''' Detects MIME type with ``puremagic``, if it can be imported.

        Returns ``NotImplemented`` when the import fails or when the library
        raises ``PureError`` or ``ValueError`` for the content. Confidence
        is derived from the quantity of content bytes.
    '''
    try: import puremagic # pragma: no cover
    except ImportError: return NotImplemented # pragma: no cover
    try:
        detected = puremagic.from_string( content, mime = True )
    except ( puremagic.PureError, ValueError ): # pragma: no cover
        return NotImplemented
    return _MimetypeResult(
        mimetype = detected,
        confidence = _core.confidence_from_bytes_quantity(
            content, behaviors = behaviors ) )

mimetype_detectors[ 'puremagic' ] = _detect_via_puremagic

317 

318 

def _normalize_charset_detection(
    content: _nomina.Content, behaviors: _Behaviors, result: _CharsetResult
) -> _CharsetResult:
    ''' Normalizes charset name on detection result.

        A result without a charset is returned unchanged. Otherwise, a new
        result is returned with the normalized charset name and the original
        confidence.

        ``utf-8-sig`` is only retained when the content actually begins with
        the UTF-8 byte order mark; otherwise it is reported as ``utf-8``.
        The ``behaviors`` argument is currently unused.
    '''
    if result.charset is None: return result # pragma: no cover
    charset = _charsets.normalize_charset( result.charset )
    # Must compare against the UTF-8 BOM. ``codecs.BOM`` is the native-endian
    # UTF-16 BOM, which never prefixes UTF-8 content, so using it would
    # demote every 'utf-8-sig' detection to 'utf-8'.
    if (    charset == 'utf-8-sig'
        and not content.startswith( __.codecs.BOM_UTF8 )
    ): charset = 'utf-8'
    return _CharsetResult( charset = charset, confidence = result.confidence )