Coverage for sources / detextive / charsets.py: 100%
94 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-17 06:15 +0000
1# vim: set filetype=python fileencoding=utf-8:
2# -*- coding: utf-8 -*-
4#============================================================================#
5# #
6# Licensed under the Apache License, Version 2.0 (the "License"); #
7# you may not use this file except in compliance with the License. #
8# You may obtain a copy of the License at #
9# #
10# http://www.apache.org/licenses/LICENSE-2.0 #
11# #
12# Unless required by applicable law or agreed to in writing, software #
13# distributed under the License is distributed on an "AS IS" BASIS, #
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
15# See the License for the specific language governing permissions and #
16# limitations under the License. #
17# #
18#============================================================================#
21''' Management of bytes array decoding via trial character sets. '''
24from . import __
25from . import core as _core
26from . import exceptions as _exceptions
27from . import nomina as _nomina
29from .core import ( # isort: skip
30 BEHAVIORS_DEFAULT as _BEHAVIORS_DEFAULT,
31 BehaviorTristate as _BehaviorTristate,
32 Behaviors as _Behaviors,
33 CharsetResult as _CharsetResult,
34 CodecSpecifiers as _CodecSpecifiers,
35)
def attempt_decodes( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _Behaviors = _BEHAVIORS_DEFAULT,
    inference: __.Absential[ str ] = __.absent,
    supplement: __.Absential[ str ] = __.absent,
    location: __.Absential[ _nomina.Location ] = __.absent,
    validator: __.Absential[
        __.cabc.Callable[ [ str, _CharsetResult ], None ]
    ] = __.absent,
) -> tuple[ str, _CharsetResult ]:
    ''' Attempts to decode content with various character sets.

        Character sets are tried in the order given by the trial codecs
        listed on the behaviors object. The first decode which succeeds
        (and which satisfies the validator, if one is supplied) wins.
    '''
    confidence = _core.confidence_from_bytes_quantity(
        content, behaviors = behaviors )
    candidates = _collect_trial_codecs(
        content,
        behaviors = behaviors,
        inference = inference,
        supplement = supplement )
    attempted: set[ str ] = set( )
    has_validator = not __.is_absent( validator )
    for candidate in candidates:
        # Record every candidate we touch, even on decode failure,
        # so the failure exception can report all attempted charsets.
        attempted.add( candidate )
        try:
            text = content.decode(
                candidate, errors = behaviors.on_decode_error )
        except UnicodeDecodeError: continue
        result = _CharsetResult(
            charset = normalize_charset_for_content( content, candidate ),
            confidence = confidence )
        if has_validator:
            try: validator( text, result )
            except _exceptions.TextInvalidity: continue
        return text, result
    raise _exceptions.ContentDecodeFailure(
        charset = tuple( attempted ), location = location )
def discover_os_charset_default( ) -> str:
    ''' Discovers default character set encoding from operating system. '''
    # 'locale.getencoding' appeared in Python 3.11; fall back to the
    # filesystem encoding on older interpreters.
    if hasattr( __.locale, 'getencoding' ):
        return normalize_charset( __.locale.getencoding( ) )
    return normalize_charset( __.sys.getfilesystemencoding( ) )
def normalize_charset( charset: str, bom_cognizant: bool = False ) -> str:
    ''' Normalizes character set encoding names.

        Resolves aliases to the codec registry's canonical name. When
        BOM-cognizant, reports plain UTF-8 as its BOM-bearing variant.
    '''
    canonical = __.codecs.lookup( charset ).name
    if 'utf-8' == canonical and bom_cognizant: return 'utf-8-sig'
    return canonical
def normalize_charset_for_content(
    content: _nomina.Content, charset: str
) -> str:
    ''' Normalizes charset reporting based on byte-order mark provenance. '''
    canonical = normalize_charset( charset )
    bom_charset = _discover_utf_bom_charset( content )
    if canonical in ( 'utf-8', 'utf-8-sig' ):
        # Content with a UTF-8 BOM is reported as the '-sig' variant.
        return 'utf-8-sig' if 'utf-8-sig' == bom_charset else 'utf-8'
    for family in ( 'utf-16', 'utf-32' ):
        if canonical.startswith( family ):
            # A matching BOM determines byte order; report bare family name.
            if bom_charset in ( f"{family}-le", f"{family}-be" ):
                return family
            break
    return canonical
108def trial_decode_as_confident( # noqa: PLR0913
109 content: _nomina.Content, /, *,
110 behaviors: _Behaviors = _BEHAVIORS_DEFAULT,
111 inference: __.Absential[ str ] = __.absent,
112 confidence: float = 0.0,
113 supplement: __.Absential[ str ] = __.absent,
114 location: __.Absential[ _nomina.Location ] = __.absent,
115) -> _CharsetResult:
116 ''' Performs trial decode of content.
118 Considers desired trial decode behavior and detection confidence.
119 '''
120 nomargs: __.NominativeArguments = dict(
121 behaviors = behaviors,
122 inference = inference,
123 supplement = supplement,
124 location = location )
125 should_decode = False
126 match behaviors.trial_decode:
127 case _BehaviorTristate.Always: should_decode = True
128 case _BehaviorTristate.AsNeeded:
129 should_decode = confidence < behaviors.trial_decode_confidence
130 case _BehaviorTristate.Never: pass
131 if should_decode:
132 _, result = attempt_decodes( content, **nomargs )
133 return result
134 if __.is_absent( inference ):
135 raise _exceptions.CharsetDetectFailure( location = location )
136 return _CharsetResult( charset = inference, confidence = confidence )
def _collect_trial_codecs(
    content: _nomina.Content, /, *,
    behaviors: _Behaviors,
    inference: __.Absential[ str ],
    supplement: __.Absential[ str ],
) -> tuple[ str, ... ]:
    ''' Resolves trial codec specifiers into unique charset candidates. '''
    # Candidate list is tiny; linear dedup keeps insertion order cheaply.
    collected: list[ str ] = [ ]
    for specifier in behaviors.trial_codecs:
        resolution = _resolve_trial_codec(
            specifier, inference = inference, supplement = supplement )
        if __.is_absent( resolution ): continue
        candidate = normalize_charset( resolution )
        if _is_ambiguous_utf_trial( content, candidate, behaviors ):
            continue
        if 'utf-8' == candidate and behaviors.remove_bom:
            candidate = 'utf-8-sig'
        if candidate not in collected: collected.append( candidate )
    return tuple( collected )
def _discover_utf_bom_charset(
    content: _nomina.Content
) -> __.typx.Optional[ str ]:
    ''' Detects charset from a leading byte-order mark, if any. '''
    # UTF-32 markers must be tested before UTF-16 markers,
    # since the latter prefix-match them.
    markers = (
        ( __.codecs.BOM_UTF32_LE, 'utf-32-le' ),
        ( __.codecs.BOM_UTF32_BE, 'utf-32-be' ),
        ( __.codecs.BOM_UTF8, 'utf-8-sig' ),
        ( __.codecs.BOM_UTF16_LE, 'utf-16-le' ),
        ( __.codecs.BOM_UTF16_BE, 'utf-16-be' ),
    )
    for marker, charset in markers:
        if content.startswith( marker ): return charset
    return None
def _is_ambiguous_utf_trial(
    content: _nomina.Content, charset: str, behaviors: _Behaviors
) -> bool:
    ''' Checks whether a UTF-16/32 trial lacks required byte-order mark. '''
    if not behaviors.utf_16_32_requires_byte_order: return False
    if charset not in ( 'utf-16', 'utf-32' ): return False
    # Bare 'utf-16'/'utf-32' is ambiguous without a BOM fixing byte order.
    bom_charset = _discover_utf_bom_charset( content )
    return bom_charset not in ( f"{charset}-le", f"{charset}-be" )
184def _resolve_trial_codec(
185 codec: __.typx.Any, /, *,
186 inference: __.Absential[ str ],
187 supplement: __.Absential[ str ],
188) -> __.Absential[ str ]:
189 charset: __.Absential[ str ] = __.absent
190 match codec:
191 case _CodecSpecifiers.FromInference:
192 if not __.is_absent( inference ): charset = inference
193 case _CodecSpecifiers.OsDefault:
194 charset = discover_os_charset_default( )
195 case _CodecSpecifiers.PythonDefault:
196 charset = __.locale.getpreferredencoding( )
197 case _CodecSpecifiers.UserSupplement:
198 if not __.is_absent( supplement ): charset = supplement
199 case str( ): charset = codec
200 case _: pass
201 return charset