Coverage for sources / detextive / charsets.py: 100%

55 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-14 04:38 +0000

1# vim: set filetype=python fileencoding=utf-8: 

2# -*- coding: utf-8 -*- 

3 

4#============================================================================# 

5# # 

6# Licensed under the Apache License, Version 2.0 (the "License"); # 

7# you may not use this file except in compliance with the License. # 

8# You may obtain a copy of the License at # 

9# # 

10# http://www.apache.org/licenses/LICENSE-2.0 # 

11# # 

12# Unless required by applicable law or agreed to in writing, software # 

13# distributed under the License is distributed on an "AS IS" BASIS, # 

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 

15# See the License for the specific language governing permissions and # 

16# limitations under the License. # 

17# # 

18#============================================================================# 

19 

20 

21''' Management of bytes array decoding via trial character sets. ''' 

22 

23 

24from . import __ 

25from . import core as _core 

26from . import exceptions as _exceptions 

27from . import nomina as _nomina 

28 

29from .core import ( # isort: skip 

30 BEHAVIORS_DEFAULT as _BEHAVIORS_DEFAULT, 

31 BehaviorTristate as _BehaviorTristate, 

32 Behaviors as _Behaviors, 

33 CharsetResult as _CharsetResult, 

34 CodecSpecifiers as _CodecSpecifiers, 

35) 

36 

37 

38def attempt_decodes( # noqa: C901,PLR0912,PLR0913,PLR0915 

39 content: _nomina.Content, /, *, 

40 behaviors: _Behaviors = _BEHAVIORS_DEFAULT, 

41 inference: __.Absential[ str ] = __.absent, 

42 supplement: __.Absential[ str ] = __.absent, 

43 location: __.Absential[ _nomina.Location ] = __.absent, 

44 validator: __.Absential[ 

45 __.cabc.Callable[ [ str, _CharsetResult ], None ] 

46 ] = __.absent, 

47) -> tuple[ str, _CharsetResult ]: 

48 ''' Attempts to decode content with various character sets. 

49 

50 Will try character sets in the order specified by the trial codecs 

51 listed on the behaviors object. 

52 ''' 

53 confidence = _core.confidence_from_bytes_quantity( 

54 content, behaviors = behaviors ) 

55 on_decode_error = behaviors.on_decode_error 

56 trials: set[ str ] = set( ) 

57 for codec in behaviors.trial_codecs: 

58 match codec: 

59 case _CodecSpecifiers.FromInference: 

60 if __.is_absent( inference ): continue 

61 charset = inference 

62 case _CodecSpecifiers.OsDefault: 

63 charset = discover_os_charset_default( ) 

64 case _CodecSpecifiers.PythonDefault: 

65 charset = __.locale.getpreferredencoding( ) 

66 case _CodecSpecifiers.UserSupplement: 

67 if __.is_absent( supplement ): continue 

68 charset = supplement 

69 case str( ): charset = codec 

70 case _: continue 

71 charset = normalize_charset( 

72 charset, bom_cognizant = behaviors.remove_bom ) 

73 if charset in trials: continue 

74 try: text = content.decode( charset, errors = on_decode_error ) 

75 except UnicodeDecodeError: continue 

76 finally: trials.add( charset ) 

77 result = _CharsetResult( charset = charset, confidence = confidence ) 

78 if not __.is_absent( validator ): 

79 try: validator( text, result ) 

80 except _exceptions.TextInvalidity: continue 

81 return text, result 

82 raise _exceptions.ContentDecodeFailure( 

83 charset = tuple( trials ), location = location ) 

84 

85 

def discover_os_charset_default( ) -> str:
    ''' Reports normalized name of operating system default charset.

        Calls ``__.locale.getencoding`` when that attribute exists and
        ``__.sys.getfilesystemencoding`` otherwise, then normalizes the
        reported name.
    '''
    locale_module = __.locale
    fallback = __.sys.getfilesystemencoding
    probe = getattr( locale_module, 'getencoding', fallback )
    encoding_name = probe( )
    return normalize_charset( encoding_name )

91 

92 

def normalize_charset( charset: str, bom_cognizant: bool = False ) -> str:
    ''' Produces canonical codec name for character set.

        The canonical name is the ``name`` reported by the codec lookup
        for ``charset``. When ``bom_cognizant`` is true and that name is
        ``utf-8``, ``utf-8-sig`` is returned in its place.
    '''
    canonical = __.codecs.lookup( charset ).name
    if canonical != 'utf-8': return canonical
    return 'utf-8-sig' if bom_cognizant else canonical

98 

99 

def trial_decode_as_confident( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _Behaviors = _BEHAVIORS_DEFAULT,
    inference: __.Absential[ str ] = __.absent,
    confidence: float = 0.0,
    supplement: __.Absential[ str ] = __.absent,
    location: __.Absential[ _nomina.Location ] = __.absent,
) -> _CharsetResult:
    ''' Decides whether to trial decode content and reports charset.

        The trial decode behavior selects the path: always decode, decode
        only when the detection confidence is below the trial decode
        confidence threshold on the behaviors, or never decode.

        When a trial decode is performed, its charset result is returned.
        Otherwise, the inferred charset is returned with the given
        confidence, or ``CharsetDetectFailure`` is raised if no inference
        was supplied.
    '''
    mode = behaviors.trial_decode
    if mode == _BehaviorTristate.Always:
        decode_wanted = True
    elif mode == _BehaviorTristate.AsNeeded:
        # Threshold is consulted only for the as-needed behavior.
        decode_wanted = confidence < behaviors.trial_decode_confidence
    else:
        decode_wanted = False
    if decode_wanted:
        _text, outcome = attempt_decodes(
            content,
            behaviors = behaviors,
            inference = inference,
            supplement = supplement,
            location = location )
        return outcome
    if not __.is_absent( inference ):
        return _CharsetResult( charset = inference, confidence = confidence )
    raise _exceptions.CharsetDetectFailure( location = location )