Coverage for sources / detextive / charsets.py: 100%

94 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-17 06:15 +0000

1# vim: set filetype=python fileencoding=utf-8: 

2# -*- coding: utf-8 -*- 

3 

4#============================================================================# 

5# # 

6# Licensed under the Apache License, Version 2.0 (the "License"); # 

7# you may not use this file except in compliance with the License. # 

8# You may obtain a copy of the License at # 

9# # 

10# http://www.apache.org/licenses/LICENSE-2.0 # 

11# # 

12# Unless required by applicable law or agreed to in writing, software # 

13# distributed under the License is distributed on an "AS IS" BASIS, # 

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 

15# See the License for the specific language governing permissions and # 

16# limitations under the License. # 

17# # 

18#============================================================================# 

19 

20 

21''' Management of bytes array decoding via trial character sets. ''' 

22 

23 

24from . import __ 

25from . import core as _core 

26from . import exceptions as _exceptions 

27from . import nomina as _nomina 

28 

29from .core import ( # isort: skip 

30 BEHAVIORS_DEFAULT as _BEHAVIORS_DEFAULT, 

31 BehaviorTristate as _BehaviorTristate, 

32 Behaviors as _Behaviors, 

33 CharsetResult as _CharsetResult, 

34 CodecSpecifiers as _CodecSpecifiers, 

35) 

36 

37 

def attempt_decodes( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _Behaviors = _BEHAVIORS_DEFAULT,
    inference: __.Absential[ str ] = __.absent,
    supplement: __.Absential[ str ] = __.absent,
    location: __.Absential[ _nomina.Location ] = __.absent,
    validator: __.Absential[
        __.cabc.Callable[ [ str, _CharsetResult ], None ]
    ] = __.absent,
) -> tuple[ str, _CharsetResult ]:
    ''' Attempts to decode content with various character sets.

        Will try character sets in the order specified by the trial codecs
        listed on the behaviors object. Returns the decoded text and a
        result carrying the normalized charset name plus detection
        confidence. An optional validator may veto a decode by raising
        ``TextInvalidity``. Raises ``ContentDecodeFailure`` when every
        trial fails.
    '''
    confidence = _core.confidence_from_bytes_quantity(
        content, behaviors = behaviors )
    trial_codecs = _collect_trial_codecs(
        content,
        behaviors = behaviors,
        inference = inference,
        supplement = supplement )
    # List (not set) preserves attempt order for deterministic failure
    # reporting; '_collect_trial_codecs' already deduplicates, so no
    # duplicate entries are possible.
    trials: list[ str ] = [ ]
    for trial_codec in trial_codecs:
        trials.append( trial_codec )
        try: text = content.decode(
            trial_codec, errors = behaviors.on_decode_error )
        except UnicodeDecodeError: continue
        result = _CharsetResult(
            charset = normalize_charset_for_content( content, trial_codec ),
            confidence = confidence )
        if not __.is_absent( validator ):
            # Validator rejection moves on to the next trial codec.
            try: validator( text, result )
            except _exceptions.TextInvalidity: continue
        return text, result
    raise _exceptions.ContentDecodeFailure(
        charset = tuple( trials ), location = location )

75 

76 

def discover_os_charset_default( ) -> str:
    ''' Discovers default character set encoding from operating system. '''
    # 'locale.getencoding' is preferred when the interpreter provides it;
    # otherwise fall back to the filesystem encoding.
    if hasattr( __.locale, 'getencoding' ):
        charset = __.locale.getencoding( )
    else: charset = __.sys.getfilesystemencoding( )
    return normalize_charset( charset )

82 

83 

def normalize_charset( charset: str, bom_cognizant: bool = False ) -> str:
    ''' Normalizes character set encoding names.

        Resolves aliases to the codec's canonical name. When
        ``bom_cognizant`` is true, canonical UTF-8 is reported as its
        BOM-bearing 'utf-8-sig' variant.
    '''
    canonical = __.codecs.lookup( charset ).name
    if canonical != 'utf-8': return canonical
    return 'utf-8-sig' if bom_cognizant else canonical

89 

90 

def normalize_charset_for_content(
    content: _nomina.Content, charset: str
) -> str:
    ''' Normalizes charset reporting based on byte-order mark provenance. '''
    canonical = normalize_charset( charset )
    bom = _discover_utf_bom_charset( content )
    # UTF-8 family: report the BOM-bearing variant only when the content
    # actually begins with a UTF-8 BOM.
    if canonical in ( 'utf-8', 'utf-8-sig' ):
        return 'utf-8-sig' if bom == 'utf-8-sig' else 'utf-8'
    # UTF-16/32 families: collapse endian-specific names to the family
    # name when a BOM established the byte order.
    for family in ( 'utf-16', 'utf-32' ):
        if canonical.startswith( family ):
            if bom in ( f"{family}-le", f"{family}-be" ): return family
            return canonical
    return canonical

106 

107 

def trial_decode_as_confident( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _Behaviors = _BEHAVIORS_DEFAULT,
    inference: __.Absential[ str ] = __.absent,
    confidence: float = 0.0,
    supplement: __.Absential[ str ] = __.absent,
    location: __.Absential[ _nomina.Location ] = __.absent,
) -> _CharsetResult:
    ''' Performs trial decode of content.

        Considers desired trial decode behavior and detection confidence.
    '''
    mode = behaviors.trial_decode
    # Decide whether a trial decode is warranted per behavior tristate.
    if mode == _BehaviorTristate.Always: should_decode = True
    elif mode == _BehaviorTristate.AsNeeded:
        should_decode = confidence < behaviors.trial_decode_confidence
    else: should_decode = False
    if should_decode:
        _, result = attempt_decodes(
            content,
            behaviors = behaviors,
            inference = inference,
            supplement = supplement,
            location = location )
        return result
    # Without a trial decode, an inferred charset is required.
    if __.is_absent( inference ):
        raise _exceptions.CharsetDetectFailure( location = location )
    return _CharsetResult( charset = inference, confidence = confidence )

137 

138 

def _collect_trial_codecs(
    content: _nomina.Content, /, *,
    behaviors: _Behaviors,
    inference: __.Absential[ str ],
    supplement: __.Absential[ str ],
) -> tuple[ str, ... ]:
    ''' Assembles ordered, deduplicated charset names to attempt. '''
    collected: list[ str ] = [ ]
    for specifier in behaviors.trial_codecs:
        resolution = _resolve_trial_codec(
            specifier, inference = inference, supplement = supplement )
        if __.is_absent( resolution ): continue
        candidate = normalize_charset( resolution )
        # Skip endianless UTF-16/32 trials lacking byte-order evidence.
        if _is_ambiguous_utf_trial( content, candidate, behaviors ):
            continue
        if behaviors.remove_bom and candidate == 'utf-8':
            candidate = 'utf-8-sig'
        # Candidate list is tiny; linear membership test suffices.
        if candidate not in collected: collected.append( candidate )
    return tuple( collected )

156 

157 

def _discover_utf_bom_charset(
    content: _nomina.Content
) -> __.typx.Optional[ str ]:
    ''' Identifies charset from any leading Unicode byte-order mark. '''
    # UTF-32 markers must be tested before UTF-16 ones, since they
    # prefix-match the UTF-16 markers.
    markers = (
        ( __.codecs.BOM_UTF32_LE, 'utf-32-le' ),
        ( __.codecs.BOM_UTF32_BE, 'utf-32-be' ),
        ( __.codecs.BOM_UTF8, 'utf-8-sig' ),
        ( __.codecs.BOM_UTF16_LE, 'utf-16-le' ),
        ( __.codecs.BOM_UTF16_BE, 'utf-16-be' ),
    )
    for marker, name in markers:
        if content.startswith( marker ): return name
    return None

168 

169 

def _is_ambiguous_utf_trial(
    content: _nomina.Content, charset: str, behaviors: _Behaviors
) -> bool:
    ''' Checks if endianless UTF-16/32 trial lacks a byte-order mark. '''
    if not behaviors.utf_16_32_requires_byte_order: return False
    if charset not in ( 'utf-16', 'utf-32' ): return False
    bom = _discover_utf_bom_charset( content )
    # Ambiguous unless a BOM pinned the byte order for this family.
    return bom not in ( f"{charset}-le", f"{charset}-be" )

182 

183 

def _resolve_trial_codec(
    codec: __.typx.Any, /, *,
    inference: __.Absential[ str ],
    supplement: __.Absential[ str ],
) -> __.Absential[ str ]:
    ''' Maps a codec specifier to a concrete charset name, if resolvable. '''
    # Equality tests mirror match value-pattern semantics; specifier
    # cases are checked before the plain-string case, as in the original
    # ordering.
    if codec == _CodecSpecifiers.FromInference:
        return inference if not __.is_absent( inference ) else __.absent
    if codec == _CodecSpecifiers.OsDefault:
        return discover_os_charset_default( )
    if codec == _CodecSpecifiers.PythonDefault:
        return __.locale.getpreferredencoding( )
    if codec == _CodecSpecifiers.UserSupplement:
        return supplement if not __.is_absent( supplement ) else __.absent
    if isinstance( codec, str ): return codec
    return __.absent