Coverage for sources / detextive / decoders.py: 100%

80 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-17 06:15 +0000

1# vim: set filetype=python fileencoding=utf-8: 

2# -*- coding: utf-8 -*- 

3 

4#============================================================================# 

5# # 

6# Licensed under the Apache License, Version 2.0 (the "License"); # 

7# you may not use this file except in compliance with the License. # 

8# You may obtain a copy of the License at # 

9# # 

10# http://www.apache.org/licenses/LICENSE-2.0 # 

11# # 

12# Unless required by applicable law or agreed to in writing, software # 

13# distributed under the License is distributed on an "AS IS" BASIS, # 

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 

15# See the License for the specific language governing permissions and # 

16# limitations under the License. # 

17# # 

18#============================================================================# 

19 

20 

21''' Conversion of bytes arrays to Unicode text. ''' 

22 

23 

24from . import __ 

25from . import charsets as _charsets 

26from . import detectors as _detectors 

27from . import exceptions as _exceptions 

28from . import inference as _inference 

29from . import lineseparators as _lineseparators 

30from . import mimetypes as _mimetypes 

31from . import nomina as _nomina 

32from . import validation as _validation 

33 

34from .core import ( # isort: skip 

35 BEHAVIORS_DEFAULT as _BEHAVIORS_DEFAULT, 

36 BehaviorTristate as _BehaviorTristate, 

37 BehaviorsArgument as _BehaviorsArgument, 

38 CharsetResult as _CharsetResult, 

39 CodecSpecifiers as _CodecSpecifiers, 

40 MimetypeResult as _MimetypeResult, 

41) 

42 

43 

# MIME type which ``decode_inform`` uses as its ``mimetype_default`` when the
# caller does not supply one.
_MIMETYPE_DEFAULT_TEXTUAL = 'text/plain'

45 

46 

class DecodeInformResult( __.immut.DataclassObject ):
    ''' Outcome of a decode: the text plus metadata inferred alongside it. '''

    # Field order is significant for the dataclass; keep it stable.
    text: __.typx.Annotated[
        str,
        __.ddoc.Doc( ''' Decoded text content. ''' ),
    ]
    charset: __.typx.Annotated[
        _CharsetResult,
        __.ddoc.Doc( ''' Charset used for decoding. ''' ),
    ]
    mimetype: __.typx.Annotated[
        _MimetypeResult,
        __.ddoc.Doc( ''' Inferred MIME type metadata. ''' ),
    ]
    linesep: __.typx.Annotated[
        __.typx.Optional[ _lineseparators.LineSeparators ],
        __.ddoc.Doc( ''' Detected line separator from content sample. ''' ),
    ]

63 

64 

def decode( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    profile: _validation.ProfileArgument = _validation.PROFILE_TEXTUAL,
    http_content_type: _nomina.HttpContentTypeArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
    charset_supplement: _nomina.CharsetSupplementArgument = __.absent,
) -> str:
    ''' Converts a bytes array into Unicode text.

        Decoding proceeds by trial decodes followed by validation. There is
        no default-return behavior: ``charset_supplement`` is only a hint
        for the trials and is never used as a fallback return value.
    '''
    # Only the charset half of the parsed Content-Type header is needed.
    httpct_charset = _parse_http_content_type( http_content_type )[ 1 ]
    text, _ = _decode_content_charset_result(
        content, behaviors, profile,
        httpct_charset = httpct_charset,
        location = location,
        charset_supplement = charset_supplement )
    return text

85 

86 

def decode_inform( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _BehaviorsArgument = _BEHAVIORS_DEFAULT,
    profile: _validation.ProfileArgument = _validation.PROFILE_TEXTUAL,
    mimetype_default: _nomina.MimetypeDefaultArgument = (
        _MIMETYPE_DEFAULT_TEXTUAL ),
    http_content_type: _nomina.HttpContentTypeArgument = __.absent,
    location: _nomina.LocationArgument = __.absent,
    charset_supplement: _nomina.CharsetSupplementArgument = __.absent,
) -> DecodeInformResult:
    ''' Decodes bytes to text and reports the metadata inferred with it.

        The result bundles the decoded text, the charset result from
        decoding, the inferred MIME type and the line separator detected
        from the raw content.
    '''
    header_mimetype, header_charset = (
        _parse_http_content_type( http_content_type ) )
    decoded_text, charset_info = _decode_content_charset_result(
        content, behaviors, profile,
        httpct_charset = header_charset,
        location = location,
        charset_supplement = charset_supplement )
    # Keyword arguments evaluate left to right: MIME type inference runs
    # before line separator detection.
    return DecodeInformResult(
        text = decoded_text,
        charset = charset_info,
        mimetype = _infer_mimetype(
            content, behaviors,
            mimetype_default = mimetype_default,
            httpct_mimetype = header_mimetype,
            location = location,
            charset = charset_info.charset ),
        linesep = _lineseparators.LineSeparators.detect_bytes( content ) )

117 

118 

def _attempt_decode_http_content_type(
    content: _nomina.Content,
    behaviors: _BehaviorsArgument,
    profile: _validation.ProfileArgument, /, *,
    httpct_charset: __.Absential[ __.typx.Optional[ str ] ],
    location: _nomina.LocationArgument,
) -> __.Absential[ tuple[ str, _CharsetResult ] ]:
    ''' Tries to decode content with the charset from an HTTP header.

        Returns absent when no header charset was supplied or when the
        decode attempt fails with ``ContentDecodeFailure``. Raises
        ``ContentDecodeImpossibility`` when the header charset is ``None``.
        Decoded text is passed through ``_validate_text`` before return.
    '''
    impossibility = (
        _exceptions.ContentDecodeImpossibility( location = location ) )
    if __.is_absent( httpct_charset ): return __.absent
    if httpct_charset is None: raise impossibility
    # Restrict the trial to the charset inferred from the header.
    header_only_behaviors = __.dcls.replace(
        behaviors, trial_codecs = ( _CodecSpecifiers.FromInference, ) )
    try:
        decoded = _charsets.attempt_decodes(
            content,
            behaviors = header_only_behaviors,
            inference = httpct_charset,
            location = location )
    except _exceptions.ContentDecodeFailure: return __.absent
    else: text, charset_result = decoded
    # Validation uses the caller's behaviors, not the restricted copy.
    _validate_text(
        text, charset_result.confidence,
        behaviors = behaviors, profile = profile, location = location )
    return text, charset_result

142 

143 

def _decode_content_charset_result( # noqa: PLR0913
    content: _nomina.Content,
    behaviors: _BehaviorsArgument,
    profile: _validation.ProfileArgument, /, *,
    httpct_charset: __.Absential[ __.typx.Optional[ str ] ],
    location: _nomina.LocationArgument,
    charset_supplement: _nomina.CharsetSupplementArgument,
) -> tuple[ str, _CharsetResult ]:
    ''' Decodes content and returns the text with its charset result.

        Steps, in order:

        * Empty content short-circuits to an empty string reported as
          ``utf-8`` with confidence 1.0.
        * A charset from the HTTP Content-Type header is attempted first;
          if that attempt produces a result, it is returned as-is.
        * Otherwise, charset detection runs with trial decoding disabled.
          A detected charset is used as the inference only when it is
          non-empty and its confidence reaches
          ``behaviors.trial_decode_confidence``.
        * Finally, trial decodes are attempted with text validation.

        Exceptions from the helpers propagate, except
        ``CharsetDetectFailure`` from detection, which is suppressed.
    '''
    if content == b'':
        return '', _CharsetResult( charset = 'utf-8', confidence = 1.0 )
    httpct_result = _attempt_decode_http_content_type(
        content, behaviors, profile,
        httpct_charset = httpct_charset, location = location )
    if not __.is_absent( httpct_result ): return httpct_result
    charset: __.Absential[ str ] = __.absent
    # Detection must not perform its own trial decode; that happens below.
    behaviors_ = __.dcls.replace(
        behaviors, trial_decode = _BehaviorTristate.Never )
    with __.ctxl.suppress( _exceptions.CharsetDetectFailure ):
        result = _detectors.detect_charset_confidence(
            content,
            behaviors = behaviors_,
            supplement = charset_supplement,
            location = location )
        # Stays inside the suppress scope: if detection failed, there is
        # no result to inspect and the inference remains absent.
        if (    result.charset
            and result.confidence >= behaviors.trial_decode_confidence
        ): charset = result.charset
    validator = __.funct.partial(
        _validate_text_in_decode_attempt,
        behaviors = behaviors,
        profile = profile,
        location = location )
    return _charsets.attempt_decodes(
        content,
        behaviors = behaviors,
        inference = charset,
        supplement = charset_supplement,
        location = location,
        validator = validator )

184 

185 

def _infer_mimetype( # noqa: PLR0913
    content: _nomina.Content,
    behaviors: _BehaviorsArgument, /, *,
    mimetype_default: _nomina.MimetypeDefaultArgument,
    httpct_mimetype: __.Absential[ str ],
    location: _nomina.LocationArgument,
    charset: __.typx.Optional[ str ],
) -> _MimetypeResult:
    ''' Chooses a textual MIME type for already-decoded content.

        Sources are consulted in order: the HTTP Content-Type MIME type,
        the MIME type derived from the location, and then content
        detection (only if ``behaviors.mimetype_detect`` is set). The
        first textual candidate is returned. If none is textual, the
        default MIME type is returned with confidence 1.0.
    '''
    if (    not __.is_absent( httpct_mimetype )
        and _mimetypes.is_textual_mimetype( httpct_mimetype )
    ): return _MimetypeResult( mimetype = httpct_mimetype, confidence = 0.9 )
    if not __.is_absent( location ):
        located = _mimetypes.mimetype_from_location( location )
        if (    not __.is_absent( located )
            and _mimetypes.is_textual_mimetype( located )
        ): return _MimetypeResult( mimetype = located, confidence = 0.9 )
    if behaviors.mimetype_detect:
        detected = _detectors.detect_mimetype_confidence(
            content,
            behaviors = behaviors,
            default = mimetype_default,
            # The detector takes an absent charset, not None.
            charset = __.absent if charset is None else charset,
            location = location )
        if (    not __.is_absent( detected )
            and _mimetypes.is_textual_mimetype( detected.mimetype )
        ): return detected
    return _MimetypeResult( mimetype = mimetype_default, confidence = 1.0 )

217 

218 

def _parse_http_content_type(
    http_content_type: _nomina.HttpContentTypeArgument
) -> tuple[ __.Absential[ str ], __.Absential[ __.typx.Optional[ str ] ] ]:
    ''' Splits an HTTP Content-Type value into MIME type and charset.

        An absent header produces an absent MIME type and absent charset.
    '''
    if not __.is_absent( http_content_type ):
        return _inference.parse_http_content_type( http_content_type )
    return __.absent, __.absent

225 

226 

def _validate_text(
    text: str, confidence: float, /, *,
    behaviors: _BehaviorsArgument,
    profile: _validation.ProfileArgument,
    location: _nomina.LocationArgument,
) -> None:
    ''' Checks decoded text against a profile, as behaviors dictate.

        Validation always runs for ``Always``, never runs for ``Never``,
        and for ``AsNeeded`` runs only when ``confidence`` is below
        ``behaviors.text_validate_confidence``. Raises ``TextInvalidity``
        when validation runs and the profile rejects the text.
    '''
    invalidity = _exceptions.TextInvalidity( location = location )
    mode = behaviors.text_validate
    if mode == _BehaviorTristate.Always: required = True
    elif mode == _BehaviorTristate.AsNeeded:
        required = confidence < behaviors.text_validate_confidence
    else: required = False
    if not required: return
    if not profile( text ): raise invalidity

242 

243 

def _validate_text_in_decode_attempt(
    text: str, result: _CharsetResult, /, *,
    behaviors: _BehaviorsArgument,
    profile: _validation.ProfileArgument,
    location: _nomina.LocationArgument,
) -> None:
    ''' Validates text produced by a trial decode attempt.

        The confidence carried by ``result`` is not consulted; a fixed
        confidence of 0.0 is handed to ``_validate_text`` instead.
    '''
    forced_confidence = 0.0
    _validate_text(
        text, forced_confidence,
        behaviors = behaviors, profile = profile, location = location )