Coverage for sources/detextive/charsets.py: 100%

49 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-20 18:02 +0000

1# vim: set filetype=python fileencoding=utf-8: 

2# -*- coding: utf-8 -*- 

3 

4#============================================================================# 

5# # 

6# Licensed under the Apache License, Version 2.0 (the "License"); # 

7# you may not use this file except in compliance with the License. # 

8# You may obtain a copy of the License at # 

9# # 

10# http://www.apache.org/licenses/LICENSE-2.0 # 

11# # 

12# Unless required by applicable law or agreed to in writing, software # 

13# distributed under the License is distributed on an "AS IS" BASIS, # 

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 

15# See the License for the specific language governing permissions and # 

16# limitations under the License. # 

17# # 

18#============================================================================# 

19 

20 

21''' Management of bytes array decoding via trial character sets. ''' 

22 

23 

24from . import __ 

25from . import core as _core 

26from . import exceptions as _exceptions 

27from . import nomina as _nomina 

28 

29from .core import ( # isort: skip 

30 BEHAVIORS_DEFAULT as _BEHAVIORS_DEFAULT, 

31 BehaviorTristate as _BehaviorTristate, 

32 Behaviors as _Behaviors, 

33 CharsetResult as _CharsetResult, 

34 CodecSpecifiers as _CodecSpecifiers, 

35) 

36 

37 

38def attempt_decodes( 

39 content: _nomina.Content, /, *, 

40 behaviors: _Behaviors = _BEHAVIORS_DEFAULT, 

41 inference: __.Absential[ str ] = __.absent, 

42 supplement: __.Absential[ str ] = __.absent, 

43 location: __.Absential[ _nomina.Location ] = __.absent, 

44) -> tuple[ str, _CharsetResult ]: 

45 ''' Attempts to decode content with various character sets. 

46 

47 Will try character sets in the order specified by the trial codecs 

48 listed on the behaviors object. 

49 ''' 

50 confidence = _core.confidence_from_bytes_quantity( 

51 content, behaviors = behaviors ) 

52 on_decode_error = behaviors.on_decode_error 

53 trials: list[ str ] = [ ] 

54 for codec in behaviors.trial_codecs: 

55 match codec: 

56 case _CodecSpecifiers.FromInference: 

57 if __.is_absent( inference ): continue 

58 charset = inference 

59 case _CodecSpecifiers.OsDefault: 

60 charset = discover_os_charset_default( ) 

61 case _CodecSpecifiers.PythonDefault: 

62 charset = __.locale.getpreferredencoding( ) 

63 case _CodecSpecifiers.UserSupplement: 

64 if __.is_absent( supplement ): continue 

65 charset = supplement 

66 case str( ): charset = codec 

67 case _: continue 

68 try: text = content.decode( charset, errors = on_decode_error ) 

69 except UnicodeDecodeError: 

70 trials.append( charset ) 

71 continue 

72 result = _CharsetResult( charset = charset, confidence = confidence ) 

73 return text, result 

74 raise _exceptions.ContentDecodeFailure( 

75 charset = trials, location = location ) 

76 

77 

def discover_os_charset_default( ) -> str:
    ''' Reports normalized name of default OS character set encoding.

        Uses ``locale.getencoding`` when the ``locale`` module provides
        it; otherwise, falls back to ``sys.getfilesystemencoding``.
    '''
    fallback = __.sys.getfilesystemencoding
    discover = getattr( __.locale, 'getencoding', fallback )
    charset = discover( )
    return normalize_charset( charset )

83 

84 

def normalize_charset( charset: str ) -> str:
    ''' Returns name under which codecs registry knows character set. '''
    codec_info = __.codecs.lookup( charset )
    return codec_info.name

88 

89 

def trial_decode_as_confident( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _Behaviors = _BEHAVIORS_DEFAULT,
    inference: __.Absential[ str ] = __.absent,
    confidence: float = 0.0,
    supplement: __.Absential[ str ] = __.absent,
    location: __.Absential[ _nomina.Location ] = __.absent,
) -> _CharsetResult:
    ''' Produces charset result, via trial decode if warranted.

        The trial decode behavior decides whether content is decoded:
        always, never, or only when the given confidence is below the
        trial decode confidence threshold on the behaviors object.

        Without a trial decode, the inferred character set is returned
        with the given confidence. Raises ``CharsetDetectFailure`` if the
        inference is absent in that case.
    '''
    policy = behaviors.trial_decode
    if policy == _BehaviorTristate.Always: wants_trial = True
    elif policy == _BehaviorTristate.AsNeeded:
        wants_trial = confidence < behaviors.trial_decode_confidence
    else: wants_trial = False
    if wants_trial:
        # Only the charset result is of interest; decoded text is dropped.
        _text, trial_result = attempt_decodes(
            content,
            behaviors = behaviors,
            inference = inference,
            supplement = supplement,
            location = location )
        return trial_result
    if not __.is_absent( inference ):
        return _CharsetResult( charset = inference, confidence = confidence )
    raise _exceptions.CharsetDetectFailure( location = location )

118 return _CharsetResult( charset = inference, confidence = confidence )