Coverage for sources/detextive/detectors.py: 100%

1# vim: set filetype=python fileencoding=utf-8:

2# -*- coding: utf-8 -*-

4#============================================================================#

5# #

6# Licensed under the Apache License, Version 2.0 (the "License"); #

7# you may not use this file except in compliance with the License. #

8# You may obtain a copy of the License at #

9# #

10# http://www.apache.org/licenses/LICENSE-2.0 #

11# #

12# Unless required by applicable law or agreed to in writing, software #

13# distributed under the License is distributed on an "AS IS" BASIS, #

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #

15# See the License for the specific language governing permissions and #

16# limitations under the License. #

17# #

18#============================================================================#

21''' Core detection function implementations. '''

24from . import __

25from . import charsets as _charsets

26from . import core as _core

27from . import exceptions as _exceptions

28from . import mimetypes as _mimetypes

29from . import nomina as _nomina

30from . import validation as _validation

32from .core import ( # isort: skip

33 BEHAVIORS_DEFAULT as _BEHAVIORS_DEFAULT,

34 CHARSET_DEFAULT as _CHARSET_DEFAULT,

35 MIMETYPE_DEFAULT as _MIMETYPE_DEFAULT,

36 BehaviorTristate as _BehaviorTristate,

37 Behaviors as _Behaviors,

38 BehaviorsArgument as _BehaviorsArgument,

39 CharsetResult as _CharsetResult,

40 CodecSpecifiers as _CodecSpecifiers,

41 DetectFailureActions as _DetectFailureActions,

42 MimetypeResult as _MimetypeResult,

43)

# Call signature shared by every callable stored in ``charset_detectors``:
# ``( content, behaviors )`` in, charset result or ``NotImplemented`` out.
CharsetDetector: __.typx.TypeAlias = __.typx.Annotated[
    __.cabc.Callable[
        [ _nomina.Content, _Behaviors ],
        _CharsetResult | __.types.NotImplementedType
    ],
    __.ddoc.Doc(
        ''' Character set detector function.

            Takes bytes content and behaviors object.

            Returns either a detection result or ``NotImplemented``. The
            detection result will include the name of the character set, which
            has been determined as able to decode the content, or ``None``, if
            it believes that no character set is applicable to the content, and
            the confidence of the detection.
        ''' ),
]

# Call signature shared by every callable stored in ``mimetype_detectors``:
# ``( content, behaviors )`` in, MIME type result or ``NotImplemented`` out.
MimetypeDetector: __.typx.TypeAlias = __.typx.Annotated[
    __.cabc.Callable[
        [ _nomina.Content, _Behaviors ],
        _MimetypeResult | __.types.NotImplementedType,
    ],
    __.ddoc.Doc(
        ''' MIME type detector function.

            Takes bytes content and behaviors object.

            Returns either a detection result or ``NotImplemented``. The
            detection result will include the MIME type and the confidence of
            the detection.
        ''' ),
]

# Name -> detector mapping consulted by ``detect_charset_confidence``.
# Detectors defined in this module register themselves right after their
# definitions.
charset_detectors: __.typx.Annotated[
    __.accret.Dictionary[ str, CharsetDetector ],
    __.ddoc.Doc( ''' Registry for character set detectors. ''' ),
] = __.accret.Dictionary( )

# Name -> detector mapping consulted by ``detect_mimetype_confidence``.
# Detectors defined in this module register themselves right after their
# definitions.
mimetype_detectors: __.typx.Annotated[
    __.accret.Dictionary[ str, MimetypeDetector ],
    __.ddoc.Doc( ''' Registry for MIME type detectors. ''' ),
] = __.accret.Dictionary( )

def detect_charset( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    default: _nomina.CharsetDefaultArgument = _CHARSET_DEFAULT,
    supplement: _nomina.CharsetSupplementArgument = __.absent,
    mimetype: _nomina.MimetypeAssumptionArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
) -> __.typx.Optional[ str ]:
    ''' Detects character set.

        Forwards every argument, unchanged, to
        :py:func:`detect_charset_confidence` and returns only the
        ``charset`` attribute of its result, discarding the confidence.
    '''
    nomargs: __.NominativeArguments = dict(
        behaviors = behaviors,
        default = default,
        supplement = supplement,
        mimetype = mimetype,
        location = location )
    return detect_charset_confidence( content, **nomargs ).charset

107

108

def detect_charset_confidence( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    default: _nomina.CharsetDefaultArgument = _CHARSET_DEFAULT,
    supplement: _nomina.CharsetSupplementArgument = __.absent,
    mimetype: _nomina.MimetypeAssumptionArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
) -> _CharsetResult:
    ''' Detects character set candidates with confidence scores.

        Empty content yields ``default`` with confidence ``1.0``.

        Otherwise, registered detectors are consulted in the order given
        by ``behaviors.charset_detectors_order``; names without a registry
        entry are skipped, as are detectors which return
        ``NotImplemented``. The first real result wins.

        If no detector produces a result, then
        ``behaviors.charset_on_detect_failure`` decides between returning
        ``default`` with confidence ``0.0`` and raising
        ``CharsetDetectFailure``.

        A result which names a charset is confirmed via
        ``_confirm_charset_detection``. A result with no charset is
        returned as-is, unless ``mimetype`` is supplied and textual, in
        which case a trial decode is attempted instead.
    '''
    if b'' == content:
        return _CharsetResult( charset = default, confidence = 1.0 )
    detection: _CharsetResult | __.types.NotImplementedType = NotImplemented
    for name in behaviors.charset_detectors_order:
        detector = charset_detectors.get( name )
        if detector is None: continue
        detection = detector( content, behaviors )
        if detection is not NotImplemented: break
    if detection is NotImplemented:
        # Every detector was missing or declined.
        if behaviors.charset_on_detect_failure == _DetectFailureActions.Default:
            return _CharsetResult( charset = default, confidence = 0.0 )
        raise _exceptions.CharsetDetectFailure( location = location )
    if detection.charset is not None:
        return _confirm_charset_detection(
            content, behaviors, detection,
            supplement = supplement, location = location )
    # Detector found no applicable charset. Only second-guess it when the
    # caller asserts that the content has a textual MIME type.
    if (    __.is_absent( mimetype )
        or  not _mimetypes.is_textual_mimetype( mimetype )
    ): return detection
    decoded = _charsets.trial_decode_as_confident(
        content,
        behaviors = behaviors,
        supplement = supplement,
        location = location )
    return _normalize_charset_detection( content, behaviors, decoded )

144

145

def detect_mimetype(
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    default: _nomina.MimetypeDefaultArgument = _MIMETYPE_DEFAULT,
    charset: _nomina.CharsetAssumptionArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
) -> str:
    ''' Detects most probable MIME type.

        Forwards every argument, unchanged, to
        :py:func:`detect_mimetype_confidence` and returns only the
        ``mimetype`` attribute of its result, discarding the confidence.
    '''
    detection = detect_mimetype_confidence(
        content,
        behaviors = behaviors,
        default = default,
        charset = charset,
        location = location )
    return detection.mimetype

161

162

def detect_mimetype_confidence(
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    default: _nomina.MimetypeDefaultArgument = _MIMETYPE_DEFAULT,
    charset: _nomina.CharsetAssumptionArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
) -> _MimetypeResult:
    ''' Detects MIME type candidates with confidence scores.

        Empty content yields ``text/plain`` with confidence ``1.0``.

        Otherwise, registered detectors are consulted in the order given
        by ``behaviors.mimetype_detectors_order``; names without a
        registry entry are skipped, and the first result other than
        ``NotImplemented`` wins.

        When no detector produced a result, or the result is non-textual
        with confidence below ``behaviors.trial_decode_confidence``, and
        ``charset`` is supplied, the content is trial-decoded with that
        charset. If this yields ``text/plain``, then that is returned.

        Otherwise, the detector result is returned when there is one. When
        there is none, ``behaviors.mimetype_on_detect_failure`` decides
        between returning ``default`` with confidence ``0.0`` and raising
        ``MimetypeDetectFailure``.
    '''
    if b'' == content:
        return _MimetypeResult( mimetype = 'text/plain', confidence = 1.0 )
    detection: _MimetypeResult | __.types.NotImplementedType = NotImplemented
    for name in behaviors.mimetype_detectors_order:
        detector = mimetype_detectors.get( name )
        if detector is None: continue
        detection = detector( content, behaviors )
        if detection is NotImplemented: continue
        break
    undetected = detection is NotImplemented
    if undetected: consult_charset = True
    else:
        consult_charset = (
            not _mimetypes.is_textual_mimetype( detection.mimetype )
            and detection.confidence < behaviors.trial_decode_confidence )
    if consult_charset and not __.is_absent( charset ):
        # For charset validation, only try specified charset (no OS default)
        behaviors_charset_only = __.dcls.replace(
            behaviors, trial_codecs = ( _CodecSpecifiers.FromInference, ) )
        textual = _detect_mimetype_from_charset(
            content, behaviors_charset_only, charset,
            default = default, location = location )
        if textual.mimetype == 'text/plain': return textual
    if not undetected: return detection
    if behaviors.mimetype_on_detect_failure == _DetectFailureActions.Default:
        return _MimetypeResult( mimetype = default, confidence = 0.0 )
    raise _exceptions.MimetypeDetectFailure( location = location )

198

199

def _confirm_charset_detection( # noqa: PLR0911
    content: _nomina.Content,
    behaviors: _Behaviors,
    result: _CharsetResult, /, *,
    supplement: __.Absential[ str ] = __.absent,
    location: __.Absential[ _nomina.Location ] = __.absent,
) -> _CharsetResult:
    ''' Confirms detected charset, possibly replacing it.

        The result is normalized first. A ``utf-*`` charset is passed
        through a confident trial decode, with the detected charset as the
        inference, and the outcome is returned normalized.

        For any other charset, nothing further happens if
        ``behaviors.trial_decode`` is ``Never`` or if the charset equals
        the discovered OS default. Otherwise, a decode is attempted with
        ``utf-8-sig`` as the inference: on decode failure, the original
        result stands; on success with a different charset, that charset
        replaces the detected one while keeping the original confidence.

        In both trial paths, the trial codecs are restricted to the user
        supplement and the inference.
    '''
    result = _normalize_charset_detection( content, behaviors, result )
    if result.charset is None: return result # pragma: no cover
    charset, confidence = result.charset, result.confidence
    # Shared by both trial paths below.
    trial_codecs = (
        _CodecSpecifiers.UserSupplement, _CodecSpecifiers.FromInference )
    if charset.startswith( 'utf-' ):
        behaviors_no_fallback = __.dcls.replace(
            behaviors, trial_codecs = trial_codecs )
        confirmed = _charsets.trial_decode_as_confident(
            content,
            behaviors = behaviors_no_fallback,
            supplement = supplement,
            inference = charset,
            confidence = confidence,
            location = location )
        return _normalize_charset_detection( content, behaviors, confirmed )
    if behaviors.trial_decode == _BehaviorTristate.Never: return result
    # Shake out false positives, like 'MacRoman'.
    if charset == _charsets.discover_os_charset_default( ):
        # Allow 'windows-1252', etc..., as appropriate.
        return result # pragma: no cover
    # Try UTF-8 to shake out false positives, but not OS default.
    behaviors_utf8_only = __.dcls.replace(
        behaviors, trial_codecs = trial_codecs )
    try:
        _, utf8_result = _charsets.attempt_decodes(
            content,
            behaviors = behaviors_utf8_only,
            inference = 'utf-8-sig',
            supplement = supplement,
            location = location )
    except _exceptions.ContentDecodeFailure: return result
    if charset == utf8_result.charset: return result # pragma: no cover
    replacement = _CharsetResult(
        charset = utf8_result.charset, confidence = confidence )
    return _normalize_charset_detection( content, behaviors, replacement )

248

249

def _detect_mimetype_from_charset(
    content: _nomina.Content,
    behaviors: _Behaviors,
    charset: str, /, *,
    default: str,
    location: __.Absential[ _nomina.Location ],
) -> _MimetypeResult:
    ''' Infers ``text/plain`` from successful decode with given charset.

        Returns ``text/plain``, with the confidence of the decode, only if
        the content decodes with ``charset`` as the inference and the
        decoded text passes the textual validation profile.

        Every other outcome counts as a failure: ``behaviors.trial_decode``
        or ``behaviors.text_validate`` being ``Never``, a decode failure,
        or text which fails validation. On failure, ``default`` is
        returned with confidence ``0.0``, unless
        ``behaviors.mimetype_on_detect_failure`` is something other than
        ``Default``, in which case ``MimetypeDetectFailure`` is raised.
    '''
    raises_on_failure = (
        behaviors.mimetype_on_detect_failure != _DetectFailureActions.Default )
    error = _exceptions.MimetypeDetectFailure( location = location )
    fallback = _MimetypeResult( mimetype = default, confidence = 0.0 )
    if behaviors.trial_decode == _BehaviorTristate.Never:
        if raises_on_failure: raise error
        return fallback
    try:
        text, decoded = _charsets.attempt_decodes(
            content,
            behaviors = behaviors, inference = charset, location = location )
    except _exceptions.ContentDecodeFailure:
        # Decode failure is an implementation detail of the detection.
        if raises_on_failure: raise error from None
        return fallback
    # Without validation, decoded text cannot be vouched for as textual.
    unconfirmed = (
        behaviors.text_validate == _BehaviorTristate.Never
        or not _validation.PROFILE_TEXTUAL( text ) )
    if unconfirmed:
        if raises_on_failure: raise error
        return fallback
    return _MimetypeResult(
        mimetype = 'text/plain', confidence = decoded.confidence )

285

286

def _detect_via_chardet(
    content: _nomina.Content, behaviors: _Behaviors
) -> _CharsetResult | __.types.NotImplementedType:
    ''' Detects charset with ``chardet`` package.

        Returns ``NotImplemented`` if the package cannot be imported.
        Otherwise, relays the ``encoding`` and ``confidence`` entries
        which ``chardet.detect`` reports. ``behaviors`` is not consulted.
    '''
    try: import chardet # pragma: no cover
    except ImportError: return NotImplemented # pragma: no cover
    report = chardet.detect( content )
    return _CharsetResult(
        charset = report[ 'encoding' ], confidence = report[ 'confidence' ] )

charset_detectors[ 'chardet' ] = _detect_via_chardet

297

298

def _detect_via_charset_normalizer(
    content: _nomina.Content, behaviors: _Behaviors
) -> _CharsetResult | __.types.NotImplementedType:
    ''' Detects charset with ``charset_normalizer`` package.

        Returns ``NotImplemented`` if the package cannot be imported.
        Otherwise, reports the encoding of the best match, or ``None`` if
        there is no best match. The confidence does not come from the
        package; it is derived from the quantity of content bytes.
    '''
    try: import charset_normalizer # pragma: no cover
    except ImportError: return NotImplemented # pragma: no cover
    best = charset_normalizer.from_bytes( content ).best( )
    if best is None: charset = None # pragma: no cover
    else: charset = best.encoding # pragma: no cover
    return _CharsetResult(
        charset = charset,
        confidence = _core.confidence_from_bytes_quantity(
            content, behaviors = behaviors ) )

charset_detectors[ 'charset-normalizer' ] = _detect_via_charset_normalizer

311

312

def _detect_via_magic(
    content: _nomina.Content, behaviors: _Behaviors
) -> _MimetypeResult | __.types.NotImplementedType:
    ''' Detects MIME type with ``magic`` package.

        Returns ``NotImplemented`` if the package cannot be imported or if
        ``magic.from_buffer`` raises any exception. The confidence is
        derived from the quantity of content bytes.
    '''
    try: import magic # pragma: no cover
    except ImportError: return NotImplemented # pragma: no cover
    # Best effort: any library failure means this detector abstains.
    try: mimetype = magic.from_buffer( content, mime = True )
    except Exception: return NotImplemented # pragma: no cover
    return _MimetypeResult(
        mimetype = mimetype,
        confidence = _core.confidence_from_bytes_quantity(
            content, behaviors = behaviors ) )

mimetype_detectors[ 'magic' ] = _detect_via_magic

325

326

def _detect_via_puremagic(
    content: _nomina.Content, behaviors: _Behaviors
) -> _MimetypeResult | __.types.NotImplementedType:
    ''' Detects MIME type with ``puremagic`` package.

        Returns ``NotImplemented`` if the package cannot be imported or if
        ``puremagic.from_string`` raises ``PureError`` or ``ValueError``.
        The confidence is derived from the quantity of content bytes.
    '''
    try: import puremagic # pragma: no cover
    except ImportError: return NotImplemented # pragma: no cover
    try: mimetype = puremagic.from_string( content, mime = True )
    except ( puremagic.PureError, ValueError ): # pragma: no cover
        return NotImplemented
    return _MimetypeResult(
        mimetype = mimetype,
        confidence = _core.confidence_from_bytes_quantity(
            content, behaviors = behaviors ) )

mimetype_detectors[ 'puremagic' ] = _detect_via_puremagic

340

341

def _normalize_charset_detection(
    content: _nomina.Content, behaviors: _Behaviors, result: _CharsetResult
) -> _CharsetResult:
    ''' Normalizes charset name in detection result.

        A result with no charset is returned unchanged. Otherwise, the
        charset name is passed through ``_charsets.normalize_charset`` and
        a new result with the same confidence is returned.

        ``utf-8-sig`` is kept only if the content actually begins with the
        UTF-8 byte order mark; otherwise it is reported as ``utf-8``.
        ``behaviors`` is accepted but not consulted.
    '''
    if result.charset is None: return result # pragma: no cover
    charset = _charsets.normalize_charset( result.charset )
    # Compare against the UTF-8 mark specifically. ``codecs.BOM`` is the
    # native-endian UTF-16 mark, which UTF-8 content never starts with, so
    # using it would downgrade every 'utf-8-sig' result to 'utf-8'.
    # NOTE(review): assumes ``__.codecs`` is the standard library module.
    if charset == 'utf-8-sig' and not content.startswith(
        __.codecs.BOM_UTF8
    ): charset = 'utf-8'
    return _CharsetResult( charset = charset, confidence = result.confidence )

Coverage for sources/detextive/detectors.py: 100%

128 statements