Coverage for sources/detextive/charsets.py: 100%

1# vim: set filetype=python fileencoding=utf-8:

2# -*- coding: utf-8 -*-

4#============================================================================#

5# #

6# Licensed under the Apache License, Version 2.0 (the "License"); #

7# you may not use this file except in compliance with the License. #

8# You may obtain a copy of the License at #

9# #

10# http://www.apache.org/licenses/LICENSE-2.0 #

11# #

12# Unless required by applicable law or agreed to in writing, software #

13# distributed under the License is distributed on an "AS IS" BASIS, #

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #

15# See the License for the specific language governing permissions and #

16# limitations under the License. #

17# #

18#============================================================================#

21''' Management of bytes array decoding via trial character sets. '''

24from . import __

25from . import core as _core

26from . import exceptions as _exceptions

27from . import nomina as _nomina

29from .core import ( # isort: skip

30 BEHAVIORS_DEFAULT as _BEHAVIORS_DEFAULT,

31 BehaviorTristate as _BehaviorTristate,

32 Behaviors as _Behaviors,

33 CharsetResult as _CharsetResult,

34 CodecSpecifiers as _CodecSpecifiers,

35)

38def attempt_decodes( # noqa: C901,PLR0912,PLR0913,PLR0915

39 content: _nomina.Content, /, *,

40 behaviors: _Behaviors = _BEHAVIORS_DEFAULT,

41 inference: __.Absential[ str ] = __.absent,

42 supplement: __.Absential[ str ] = __.absent,

43 location: __.Absential[ _nomina.Location ] = __.absent,

44 validator: __.Absential[

45 __.cabc.Callable[ [ str, _CharsetResult ], None ]

46 ] = __.absent,

47) -> tuple[ str, _CharsetResult ]:

48 ''' Attempts to decode content with various character sets.

50 Will try character sets in the order specified by the trial codecs

51 listed on the behaviors object.

52 '''

53 confidence = _core.confidence_from_bytes_quantity(

54 content, behaviors = behaviors )

55 on_decode_error = behaviors.on_decode_error

56 trials: set[ str ] = set( )

57 for codec in behaviors.trial_codecs:

58 match codec:

59 case _CodecSpecifiers.FromInference:

60 if __.is_absent( inference ): continue

61 charset = inference

62 case _CodecSpecifiers.OsDefault:

63 charset = discover_os_charset_default( )

64 case _CodecSpecifiers.PythonDefault:

65 charset = __.locale.getpreferredencoding( )

66 case _CodecSpecifiers.UserSupplement:

67 if __.is_absent( supplement ): continue

68 charset = supplement

69 case str( ): charset = codec

70 case _: continue

71 charset = normalize_charset(

72 charset, bom_cognizant = behaviors.remove_bom )

73 if charset in trials: continue

74 try: text = content.decode( charset, errors = on_decode_error )

75 except UnicodeDecodeError: continue

76 finally: trials.add( charset )

77 result = _CharsetResult( charset = charset, confidence = confidence )

78 if not __.is_absent( validator ):

79 try: validator( text, result )

80 except _exceptions.TextInvalidity: continue

81 return text, result

82 raise _exceptions.ContentDecodeFailure(

83 charset = tuple( trials ), location = location )

def discover_os_charset_default( ) -> str:
    ''' Reports normalized default character set of operating system.

        Uses ``getencoding`` from the locale module when that attribute
        exists and falls back to the filesystem encoding reported by the
        sys module otherwise.
    '''
    if hasattr( __.locale, 'getencoding' ):
        charset = __.locale.getencoding( )
    else: charset = __.sys.getfilesystemencoding( )
    return normalize_charset( charset )

def normalize_charset( charset: str, bom_cognizant: bool = False ) -> str:
    ''' Produces registry name for character set encoding.

        The name is whatever the codec lookup reports for ``charset``.
        When ``bom_cognizant`` is true and that name is ``utf-8``, then
        ``utf-8-sig`` is produced instead.
    '''
    name = __.codecs.lookup( charset ).name
    wants_signature = bom_cognizant and name == 'utf-8'
    return 'utf-8-sig' if wants_signature else name

def trial_decode_as_confident( # noqa: PLR0913
    content: _nomina.Content, /, *,
    behaviors: _Behaviors = _BEHAVIORS_DEFAULT,
    inference: __.Absential[ str ] = __.absent,
    confidence: float = 0.0,
    supplement: __.Absential[ str ] = __.absent,
    location: __.Absential[ _nomina.Location ] = __.absent,
) -> _CharsetResult:
    ''' Confirms character set by trial decode, if behaviors call for it.

        A trial decode is performed when the trial decode behavior is
        "always" or when it is "as needed" and ``confidence`` is below the
        trial decode confidence threshold on the behaviors object. The
        result of that trial decode is then returned.

        Otherwise, the supplied ``inference`` is returned with the supplied
        ``confidence``, without touching the content.

        Raises ``CharsetDetectFailure`` if no trial decode is performed and
        ``inference`` is absent.
    '''
    disposition = behaviors.trial_decode
    if disposition == _BehaviorTristate.Always: decode_needed = True
    elif disposition == _BehaviorTristate.AsNeeded:
        decode_needed = confidence < behaviors.trial_decode_confidence
    else: decode_needed = False
    if decode_needed:
        # Only the charset result matters here; decoded text is discarded.
        return attempt_decodes(
            content,
            behaviors = behaviors,
            inference = inference,
            supplement = supplement,
            location = location )[ 1 ]
    if not __.is_absent( inference ):
        return _CharsetResult( charset = inference, confidence = confidence )
    raise _exceptions.CharsetDetectFailure( location = location )

Coverage for sources/detextive/charsets.py: 100%

55 statements