Coverage for sources/detextive/detectors.py: 100%

1# vim: set filetype=python fileencoding=utf-8:

2# -*- coding: utf-8 -*-

4#============================================================================#

5# #

6# Licensed under the Apache License, Version 2.0 (the "License"); #

7# you may not use this file except in compliance with the License. #

8# You may obtain a copy of the License at #

9# #

10# http://www.apache.org/licenses/LICENSE-2.0 #

11# #

12# Unless required by applicable law or agreed to in writing, software #

13# distributed under the License is distributed on an "AS IS" BASIS, #

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #

15# See the License for the specific language governing permissions and #

16# limitations under the License. #

17# #

18#============================================================================#

21''' Core detection function implementations. '''

24from . import __

25from . import charsets as _charsets

26from . import core as _core

27from . import exceptions as _exceptions

28from . import mimetypes as _mimetypes

29from . import nomina as _nomina

30from . import validation as _validation

32from .core import ( # isort: skip

33 BEHAVIORS_DEFAULT as _BEHAVIORS_DEFAULT,

34 CHARSET_DEFAULT as _CHARSET_DEFAULT,

35 MIMETYPE_DEFAULT as _MIMETYPE_DEFAULT,

36 BehaviorTristate as _BehaviorTristate,

37 Behaviors as _Behaviors,

38 BehaviorsArgument as _BehaviorsArgument,

39 CharsetResult as _CharsetResult,

40 DetectFailureActions as _DetectFailureActions,

41 MimetypeResult as _MimetypeResult,

42)

# Signature shared by every callable stored in ``charset_detectors``:
# ``( content, behaviors ) -> _CharsetResult | NotImplemented``.
CharsetDetector: __.typx.TypeAlias = __.typx.Annotated[
    __.cabc.Callable[
        [ _nomina.Content, _Behaviors ],
        _CharsetResult | __.types.NotImplementedType
    ],
    __.ddoc.Doc(
        ''' Character set detector function.

            Takes bytes content and behaviors object.

            Returns either a detection result or ``NotImplemented``. The
            detection result will include the name of the character set, which
            has been determined as able to decode the content, or ``None``, if
            it believes that no character set is applicable to the content, and
            the confidence of the detection.
        ''' ),
]

# Signature shared by every callable stored in ``mimetype_detectors``:
# ``( content, behaviors ) -> _MimetypeResult | NotImplemented``.
MimetypeDetector: __.typx.TypeAlias = __.typx.Annotated[
    __.cabc.Callable[
        [ _nomina.Content, _Behaviors ],
        _MimetypeResult | __.types.NotImplementedType,
    ],
    __.ddoc.Doc(
        ''' MIME type detector function.

            Takes bytes content and behaviors object.

            Returns either a detection result or ``NotImplemented``. The
            detection result will include the MIME type and the confidence of
            the detection.
        ''' ),
]

# Maps a detector name to its implementation. Detection looks entries up by
# the names listed in ``behaviors.charset_detectors_order``.
charset_detectors: __.typx.Annotated[
    __.accret.Dictionary[ str, CharsetDetector ],
    __.ddoc.Doc( ''' Registry for character set detectors. ''' ),
] = __.accret.Dictionary( )

# Maps a detector name to its implementation. Detection looks entries up by
# the names listed in ``behaviors.mimetype_detectors_order``.
mimetype_detectors: __.typx.Annotated[
    __.accret.Dictionary[ str, MimetypeDetector ],
    __.ddoc.Doc( ''' Registry for MIME type detectors. ''' ),
] = __.accret.Dictionary( )

def detect_charset( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    default: _nomina.CharsetDefaultArgument = _CHARSET_DEFAULT,
    supplement: _nomina.CharsetSupplementArgument = __.absent,
    mimetype: _nomina.MimetypeAssumptionArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
) -> __.typx.Optional[ str ]:
    ''' Detects character set.

        Forwards every argument unchanged to
        :py:func:`detect_charset_confidence` and returns only the
        ``charset`` field of its result, discarding the confidence.
    '''
    nomargs: __.NominativeArguments = dict(
        behaviors = behaviors,
        default = default,
        supplement = supplement,
        mimetype = mimetype,
        location = location )
    return detect_charset_confidence( content, **nomargs ).charset

106

107

def detect_charset_confidence( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    default: _nomina.CharsetDefaultArgument = _CHARSET_DEFAULT,
    supplement: _nomina.CharsetSupplementArgument = __.absent,
    mimetype: _nomina.MimetypeAssumptionArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
) -> _CharsetResult:
    ''' Detects character set candidates with confidence scores.

        Empty content is reported as ``utf-8`` with full confidence.
        Otherwise, registered detectors are tried in the order given by
        ``behaviors.charset_detectors_order``; names without a registered
        detector and detectors which return ``NotImplemented`` are skipped.

        If no detector produces a result, then, per
        ``behaviors.charset_on_detect_failure``, either ``default`` is
        returned with zero confidence or ``CharsetDetectFailure`` is raised.

        A result without a charset is returned as-is unless a textual
        ``mimetype`` was supplied, in which case a trial decode is attempted.
        A result with a charset is confirmed before it is returned.
    '''
    if b'' == content:
        return _CharsetResult( charset = 'utf-8', confidence = 1.0 )
    detection = NotImplemented
    for name in behaviors.charset_detectors_order:
        detector = charset_detectors.get( name )
        if detector is None: continue
        detection = detector( content, behaviors )
        if detection is not NotImplemented: break
    if detection is NotImplemented:
        # Every listed detector was either unregistered or declined.
        on_failure = behaviors.charset_on_detect_failure
        if on_failure == _DetectFailureActions.Default:
            return _CharsetResult( charset = default, confidence = 0.0 )
        raise _exceptions.CharsetDetectFailure( location = location )
    if detection.charset is not None:
        return _confirm_charset_detection(
            content, behaviors, detection,
            supplement = supplement, location = location )
    # Detector found no applicable charset; only second-guess it when the
    # caller asserts that the content is of a textual MIME type.
    nontextual = (
        __.is_absent( mimetype )
        or not _mimetypes.is_textual_mimetype( mimetype ) )
    if nontextual: return detection
    trial = _charsets.trial_decode_as_confident(
        content,
        behaviors = behaviors,
        supplement = supplement,
        location = location )
    return _normalize_charset_detection( content, behaviors, trial )

143

144

def detect_mimetype(
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    default: _nomina.MimetypeDefaultArgument = _MIMETYPE_DEFAULT,
    charset: _nomina.CharsetAssumptionArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
) -> str:
    ''' Detects most probable MIME type.

        Forwards every argument unchanged to
        :py:func:`detect_mimetype_confidence` and returns only the
        ``mimetype`` field of its result, discarding the confidence.
    '''
    return detect_mimetype_confidence(
        content,
        behaviors = behaviors,
        default = default,
        charset = charset,
        location = location ).mimetype

160

161

def detect_mimetype_confidence(
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    default: _nomina.MimetypeDefaultArgument = _MIMETYPE_DEFAULT,
    charset: _nomina.CharsetAssumptionArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
) -> _MimetypeResult:
    ''' Detects MIME type candidates with confidence scores.

        Empty content is reported as ``text/plain`` with full confidence.
        Otherwise, registered detectors are tried in the order given by
        ``behaviors.mimetype_detectors_order``; the first result which is
        not ``NotImplemented`` is returned directly.

        If no detector produces a result and a ``charset`` was supplied,
        then detection falls back to decoding the content with it. Without
        a ``charset``, per ``behaviors.mimetype_on_detect_failure``, either
        ``default`` is returned with zero confidence or
        ``MimetypeDetectFailure`` is raised.
    '''
    if b'' == content:
        return _MimetypeResult( mimetype = 'text/plain', confidence = 1.0 )
    for name in behaviors.mimetype_detectors_order:
        detector = mimetype_detectors.get( name )
        if detector is None: continue
        detection = detector( content, behaviors )
        if detection is not NotImplemented: return detection
    if not __.is_absent( charset ):
        return _detect_mimetype_from_charset(
            content, behaviors, charset,
            default = default, location = location )
    on_failure = behaviors.mimetype_on_detect_failure
    if on_failure == _DetectFailureActions.Default:
        return _MimetypeResult( mimetype = default, confidence = 0.0 )
    raise _exceptions.MimetypeDetectFailure( location = location )

186

187

def _confirm_charset_detection( # noqa: PLR0911
    content: _nomina.Content,
    behaviors: _Behaviors,
    result: _CharsetResult, /, *,
    supplement: __.Absential[ str ] = __.absent,
    location: __.Absential[ _nomina.Location ] = __.absent,
) -> _CharsetResult:
    ''' Confirms detected charset, using trial decodes where permitted.

        The detected charset is normalized and then mapped through
        ``behaviors.charset_promotions``. A charset whose name starts with
        ``utf-`` is handed to a confident trial decode. Any other charset
        is accepted as-is when ``behaviors.trial_decode`` is ``Never`` or
        when it equals the discovered OS default charset; otherwise a
        decode with ``utf-8-sig`` as the inference is attempted, and its
        charset wins if the decode succeeds and names a different charset.
    '''
    normalized = _normalize_charset_detection( content, behaviors, result )
    if normalized.charset is None: return normalized # pragma: no cover
    confidence = normalized.confidence
    charset = behaviors.charset_promotions.get(
        normalized.charset, normalized.charset )
    if charset.startswith( 'utf-' ):
        decoded = _charsets.trial_decode_as_confident(
            content,
            behaviors = behaviors,
            supplement = supplement,
            inference = charset,
            confidence = confidence,
            location = location )
        return _normalize_charset_detection( content, behaviors, decoded )
    promoted = _CharsetResult( charset = charset, confidence = confidence )
    if behaviors.trial_decode == _BehaviorTristate.Never: return promoted
    # Shake out false positives, like 'MacRoman'.
    if charset == _charsets.discover_os_charset_default( ):
        # Allow 'windows-1252', etc..., as appropriate.
        return promoted # pragma: no cover
    try:
        _, redetected = _charsets.attempt_decodes(
            content,
            behaviors = behaviors,
            inference = 'utf-8-sig',
            supplement = supplement,
            location = location )
    except _exceptions.ContentDecodeFailure: return promoted
    if charset == redetected.charset: return promoted # pragma: no cover
    return _normalize_charset_detection( content, behaviors, redetected )

225

226

def _detect_mimetype_from_charset(
    content: _nomina.Content,
    behaviors: _Behaviors,
    charset: str, /, *,
    default: str,
    location: __.Absential[ _nomina.Location ],
) -> _MimetypeResult:
    ''' Infers ``text/plain`` if content decodes and validates as text.

        Reports ``text/plain``, with the confidence of the decode, only
        when trial decoding is permitted, the content decodes with
        ``charset`` as the inference, text validation is permitted, and the
        decoded text passes the textual validation profile.

        In every other case, per ``behaviors.mimetype_on_detect_failure``,
        either ``default`` is returned with zero confidence or
        ``MimetypeDetectFailure`` is raised.
    '''
    raises = not (
        behaviors.mimetype_on_detect_failure
        == _DetectFailureActions.Default )
    failure = _exceptions.MimetypeDetectFailure( location = location )
    fallback = _MimetypeResult( mimetype = default, confidence = 0.0 )
    if behaviors.trial_decode == _BehaviorTristate.Never:
        if raises: raise failure
        return fallback
    try:
        text, decoding = _charsets.attempt_decodes(
            content,
            behaviors = behaviors, inference = charset, location = location )
    except _exceptions.ContentDecodeFailure:
        # Decode failure is an expected outcome; hide it from the traceback.
        if raises: raise failure from None
        return fallback
    # Short-circuit keeps the validator from running when validation is off.
    unvalidated = (
        behaviors.text_validate == _BehaviorTristate.Never
        or not _validation.PROFILE_TEXTUAL( text ) )
    if unvalidated:
        if raises: raise failure
        return fallback
    return _MimetypeResult(
        mimetype = 'text/plain', confidence = decoding.confidence )

262

263

def _detect_via_chardet(
    content: _nomina.Content, behaviors: _Behaviors
) -> _CharsetResult | __.types.NotImplementedType:
    ''' Detects charset via ``chardet``, if it can be imported.

        Returns ``NotImplemented`` when the package is unavailable.
        Otherwise, passes through the ``encoding`` and ``confidence``
        entries which ``chardet.detect`` reports.
    '''
    try: import chardet # pragma: no cover
    except ImportError: return NotImplemented # pragma: no cover
    report = chardet.detect( content )
    return _CharsetResult(
        charset = report[ 'encoding' ], confidence = report[ 'confidence' ] )

charset_detectors[ 'chardet' ] = _detect_via_chardet

274

275

def _detect_via_charset_normalizer(
    content: _nomina.Content, behaviors: _Behaviors
) -> _CharsetResult | __.types.NotImplementedType:
    ''' Detects charset via ``charset_normalizer``, if it can be imported.

        Returns ``NotImplemented`` when the package is unavailable.
        Otherwise, reports the encoding of the best match, or ``None`` when
        there is no best match. Confidence is derived from the content and
        behaviors, not taken from the library.
    '''
    try: import charset_normalizer # pragma: no cover
    except ImportError: return NotImplemented # pragma: no cover
    best = charset_normalizer.from_bytes( content ).best( )
    charset = best.encoding if best is not None else None # pragma: no cover
    return _CharsetResult(
        charset = charset,
        confidence = _core.confidence_from_bytes_quantity(
            content, behaviors = behaviors ) )

charset_detectors[ 'charset-normalizer' ] = _detect_via_charset_normalizer

288

289

def _detect_via_magic(
    content: _nomina.Content, behaviors: _Behaviors
) -> _MimetypeResult | __.types.NotImplementedType:
    ''' Detects MIME type via ``magic``, if it can be imported.

        Returns ``NotImplemented`` when the package is unavailable or when
        ``magic.from_buffer`` raises any exception. Confidence is derived
        from the content and behaviors, not taken from the library.
    '''
    try: import magic # pragma: no cover
    except ImportError: return NotImplemented # pragma: no cover
    # Deliberately broad: any failure in the library means "cannot detect".
    try: detected = magic.from_buffer( content, mime = True )
    except Exception: return NotImplemented # pragma: no cover
    return _MimetypeResult(
        mimetype = detected,
        confidence = _core.confidence_from_bytes_quantity(
            content, behaviors = behaviors ) )

mimetype_detectors[ 'magic' ] = _detect_via_magic

302

303

def _detect_via_puremagic(
    content: _nomina.Content, behaviors: _Behaviors
) -> _MimetypeResult | __.types.NotImplementedType:
    ''' Detects MIME type via ``puremagic``, if it can be imported.

        Returns ``NotImplemented`` when the package is unavailable or when
        ``puremagic.from_string`` raises ``PureError`` or ``ValueError``.
        Confidence is derived from the content and behaviors, not taken
        from the library.
    '''
    try: import puremagic # pragma: no cover
    except ImportError: return NotImplemented # pragma: no cover
    try: detected = puremagic.from_string( content, mime = True )
    except ( puremagic.PureError, ValueError ): # pragma: no cover
        return NotImplemented
    return _MimetypeResult(
        mimetype = detected,
        confidence = _core.confidence_from_bytes_quantity(
            content, behaviors = behaviors ) )

mimetype_detectors[ 'puremagic' ] = _detect_via_puremagic

317

318

319def _normalize_charset_detection(

320 content: _nomina.Content, behaviors: _Behaviors, result: _CharsetResult

321) -> _CharsetResult:

322 if result.charset is None: return result # pragma: no cover

323 charset = _charsets.normalize_charset( result.charset )

324 # TODO? Consider endianness variations for BOM.

325 if charset == 'utf-8-sig' and not content.startswith( __.codecs.BOM ):

326 charset = 'utf-8'

327 return _CharsetResult( charset = charset, confidence = result.confidence )