Coverage for sources/detextive/decoders.py: 100%

1# vim: set filetype=python fileencoding=utf-8:

2# -*- coding: utf-8 -*-

4#============================================================================#

5# #

6# Licensed under the Apache License, Version 2.0 (the "License"); #

7# you may not use this file except in compliance with the License. #

8# You may obtain a copy of the License at #

9# #

10# http://www.apache.org/licenses/LICENSE-2.0 #

11# #

12# Unless required by applicable law or agreed to in writing, software #

13# distributed under the License is distributed on an "AS IS" BASIS, #

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #

15# See the License for the specific language governing permissions and #

16# limitations under the License. #

17# #

18#============================================================================#

21''' Conversion of bytes arrays to Unicode text. '''

24from . import __

25from . import charsets as _charsets

26from . import detectors as _detectors

27from . import exceptions as _exceptions

28from . import inference as _inference

29from . import lineseparators as _lineseparators

30from . import mimetypes as _mimetypes

31from . import nomina as _nomina

32from . import validation as _validation

34from .core import ( # isort: skip

35 BEHAVIORS_DEFAULT as _BEHAVIORS_DEFAULT,

36 BehaviorTristate as _BehaviorTristate,

37 BehaviorsArgument as _BehaviorsArgument,

38 CharsetResult as _CharsetResult,

39 CodecSpecifiers as _CodecSpecifiers,

40 MimetypeResult as _MimetypeResult,

41)

# MIME type used as the default for ``mimetype_default`` in ``decode_inform``.
_MIMETYPE_DEFAULT_TEXTUAL = 'text/plain'

class DecodeInformResult( __.immut.DataclassObject ):
    ''' Decoded text with supplemental inference metadata.

        Produced by ``decode_inform``, which fills each field from a
        separate step: trial decoding, MIME type inference, and line
        separator detection on the raw content.
    '''

    # Text obtained from decoding the content.
    text: __.typx.Annotated[
        str, __.ddoc.Doc( ''' Decoded text content. ''' )
    ]
    # Charset result returned alongside the decoded text.
    charset: __.typx.Annotated[
        _CharsetResult, __.ddoc.Doc( ''' Charset used for decoding. ''' )
    ]
    # MIME type result; ``decode_inform`` only reports textual MIME types
    # or the caller-supplied default.
    mimetype: __.typx.Annotated[
        _MimetypeResult, __.ddoc.Doc( ''' Inferred MIME type metadata. ''' )
    ]
    # Result of ``LineSeparators.detect_bytes`` on the raw content.
    # NOTE(review): presumably ``None`` when no separator is found; confirm
    # against ``detect_bytes``.
    linesep: __.typx.Annotated[
        __.typx.Optional[ _lineseparators.LineSeparators ],
        __.ddoc.Doc( ''' Detected line separator from content sample. ''' ),
    ]

def decode( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    profile: _validation.ProfileArgument = _validation.PROFILE_TEXTUAL,
    http_content_type: _nomina.HttpContentTypeArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
    charset_supplement: _nomina.CharsetSupplementArgument = __.absent,
) -> str:
    ''' Decodes bytes array to Unicode text.

        Relies on trial decoding plus validation and offers no
        default-return semantics. The ``charset_supplement`` argument is
        only a hint for trial decoding; it is never used as a fallback
        return value.
    '''
    # Only the charset portion of the HTTP Content-Type is relevant here;
    # the MIME type portion is discarded.
    _, charset_from_header = _parse_http_content_type( http_content_type )
    outcome = _decode_content_charset_result(
        content, behaviors, profile,
        httpct_charset = charset_from_header,
        location = location,
        charset_supplement = charset_supplement )
    # The outcome pairs the text with its charset result; return the text.
    return outcome[ 0 ]

def decode_inform( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    profile: _validation.ProfileArgument = _validation.PROFILE_TEXTUAL,
    mimetype_default: _nomina.MimetypeDefaultArgument = (
        _MIMETYPE_DEFAULT_TEXTUAL ),
    http_content_type: _nomina.HttpContentTypeArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
    charset_supplement: _nomina.CharsetSupplementArgument = __.absent,
) -> DecodeInformResult:
    ''' Decodes bytes and returns supplemental inference metadata.

        Alongside the decoded text, reports the charset result from
        decoding, an inferred MIME type, and the line separator detected
        from the raw content.
    '''
    header_mimetype, header_charset = (
        _parse_http_content_type( http_content_type ) )
    decoded, charset_used = _decode_content_charset_result(
        content, behaviors, profile,
        httpct_charset = header_charset,
        location = location,
        charset_supplement = charset_supplement )
    # MIME type inference receives the charset which decoded the content.
    mimetype_inferred = _infer_mimetype(
        content, behaviors,
        mimetype_default = mimetype_default,
        httpct_mimetype = header_mimetype,
        location = location,
        charset = charset_used.charset )
    # Line separators are detected on the raw bytes, not the decoded text.
    return DecodeInformResult(
        text = decoded,
        charset = charset_used,
        mimetype = mimetype_inferred,
        linesep = _lineseparators.LineSeparators.detect_bytes( content ) )

117

118

def _attempt_decode_http_content_type(
    content: _nomina.Content,
    behaviors: _BehaviorsArgument,
    profile: _validation.ProfileArgument, /, *,
    httpct_charset: __.Absential[ __.typx.Optional[ str ] ],
    location: _nomina.LocationArgument,
) -> __.Absential[ tuple[ str, _CharsetResult ] ]:
    ''' Attempts decode with charset from HTTP Content-Type header.

        Returns the decoded text and charset result on success. Returns
        absence if no charset was supplied or if decoding with it fails.
        Raises ``ContentDecodeImpossibility`` if the charset is ``None``.
        A validation failure of the decoded text propagates to the caller.
    '''
    impossibility = (
        _exceptions.ContentDecodeImpossibility( location = location ) )
    # NOTE(review): ``None`` presumably marks a header which rules out
    # textual decoding; confirm against the header parser in ``_inference``.
    if httpct_charset is None: raise impossibility
    if __.is_absent( httpct_charset ): return __.absent
    # Restrict trial codecs to the inferred one, i.e., the header charset.
    behaviors_header_only = __.dcls.replace(
        behaviors, trial_codecs = ( _CodecSpecifiers.FromInference, ) )
    try:
        decoded, charset_result = _charsets.attempt_decodes(
            content,
            behaviors = behaviors_header_only,
            inference = httpct_charset,
            location = location )
    except _exceptions.ContentDecodeFailure: return __.absent
    else:
        # Validation uses the caller's behaviors, not the restricted copy.
        _validate_text(
            decoded, charset_result.confidence,
            behaviors = behaviors, profile = profile, location = location )
        return decoded, charset_result

142

143

def _decode_content_charset_result( # noqa: PLR0913
    content: _nomina.Content,
    behaviors: _BehaviorsArgument,
    profile: _validation.ProfileArgument, /, *,
    httpct_charset: __.Absential[ __.typx.Optional[ str ] ],
    location: _nomina.LocationArgument,
    charset_supplement: _nomina.CharsetSupplementArgument,
) -> tuple[ str, _CharsetResult ]:
    ''' Decodes content and reports charset used.

        Empty content short-circuits to empty text with UTF-8 at full
        confidence. Otherwise, a charset from the HTTP Content-Type header
        is tried first. If that yields nothing, charset detection supplies
        an inference for trial decoding, during which each candidate text
        is validated.
    '''
    if content == b'':
        return '', _CharsetResult( charset = 'utf-8', confidence = 1.0 )
    from_header = _attempt_decode_http_content_type(
        content, behaviors, profile,
        httpct_charset = httpct_charset, location = location )
    if not __.is_absent( from_header ): return from_header
    inferred: __.Absential[ str ] = __.absent
    # Detection only; trial decoding is deferred to the final step below.
    behaviors_detect_only = __.dcls.replace(
        behaviors, trial_decode = _BehaviorTristate.Never )
    # A detection failure simply leaves the inference absent.
    with __.ctxl.suppress( _exceptions.CharsetDetectFailure ):
        detection = _detectors.detect_charset_confidence(
            content,
            behaviors = behaviors_detect_only,
            supplement = charset_supplement,
            location = location )
        # Only a sufficiently confident detection is used as the inference.
        if (    detection.charset
            and detection.confidence >= behaviors.trial_decode_confidence
        ): inferred = detection.charset
    validate_candidate = __.funct.partial(
        _validate_text_in_decode_attempt,
        behaviors = behaviors,
        profile = profile,
        location = location )
    return _charsets.attempt_decodes(
        content,
        behaviors = behaviors,
        inference = inferred,
        supplement = charset_supplement,
        location = location,
        validator = validate_candidate )

184

185

def _infer_mimetype( # noqa: PLR0913
    content: _nomina.Content,
    behaviors: _BehaviorsArgument, /, *,
    mimetype_default: _nomina.MimetypeDefaultArgument,
    httpct_mimetype: __.Absential[ str ],
    location: _nomina.LocationArgument,
    charset: __.typx.Optional[ str ],
) -> _MimetypeResult:
    ''' Infers textual MIME type for content.

        Sources are consulted in order: the HTTP Content-Type MIME type,
        the MIME type derived from the location, and then content
        detection if the behaviors enable it. Only textual MIME types are
        accepted from any source; otherwise the default is reported with
        full confidence.
    '''
    if not __.is_absent( httpct_mimetype ):
        if _mimetypes.is_textual_mimetype( httpct_mimetype ):
            return _MimetypeResult(
                mimetype = httpct_mimetype, confidence = 0.9 )
    if not __.is_absent( location ):
        from_location = _mimetypes.mimetype_from_location( location )
        if not __.is_absent( from_location ):
            if _mimetypes.is_textual_mimetype( from_location ):
                return _MimetypeResult(
                    mimetype = from_location, confidence = 0.9 )
    if behaviors.mimetype_detect:
        # The detector takes an absent charset rather than ``None``.
        charset_hint = __.absent if charset is None else charset
        detected = _detectors.detect_mimetype_confidence(
            content,
            behaviors = behaviors,
            default = mimetype_default,
            charset = charset_hint,
            location = location )
        if (    not __.is_absent( detected )
            and _mimetypes.is_textual_mimetype( detected.mimetype )
        ): return detected
    # Nothing textual was found; report the default.
    return _MimetypeResult( mimetype = mimetype_default, confidence = 1.0 )

217

218

def _parse_http_content_type(
    http_content_type: _nomina.HttpContentTypeArgument
) -> tuple[ __.Absential[ str ], __.Absential[ __.typx.Optional[ str ] ] ]:
    ''' Splits HTTP Content-Type into MIME type and charset.

        An absent header produces an absent MIME type and an absent
        charset; otherwise parsing is delegated to the inference module.
    '''
    if not __.is_absent( http_content_type ):
        return _inference.parse_http_content_type( http_content_type )
    return __.absent, __.absent

225

226

def _validate_text(
    text: str, confidence: float, /, *,
    behaviors: _BehaviorsArgument,
    profile: _validation.ProfileArgument,
    location: _nomina.LocationArgument,
) -> None:
    ''' Validates text against profile, as behaviors dictate.

        Validation always runs in the ``Always`` mode, runs in the
        ``AsNeeded`` mode only if the confidence is below the configured
        validation threshold, and never runs otherwise. Raises
        ``TextInvalidity`` if the profile rejects the text.
    '''
    invalidity = _exceptions.TextInvalidity( location = location )
    mode = behaviors.text_validate
    if mode == _BehaviorTristate.Always: needed = True
    elif mode == _BehaviorTristate.AsNeeded:
        needed = confidence < behaviors.text_validate_confidence
    else: needed = False
    if not needed: return
    # The profile is a callable; a falsey verdict marks the text invalid.
    if not profile( text ): raise invalidity

242

243

def _validate_text_in_decode_attempt(
    text: str, result: _CharsetResult, /, *,
    behaviors: _BehaviorsArgument,
    profile: _validation.ProfileArgument,
    location: _nomina.LocationArgument,
) -> None:
    ''' Validates candidate text from a trial decode.

        The charset result is accepted for signature compatibility but is
        not consulted; validation is requested at zero confidence instead.
    '''
    # Zero confidence keeps ``AsNeeded`` validation from being skipped
    # whenever the configured threshold is positive.
    confidence_pinned = 0.0
    _validate_text(
        text, confidence_pinned,
        behaviors = behaviors, profile = profile, location = location )

Coverage for sources/detextive/decoders.py: 100%

80 statements