Coverage for sources / detextive / core.py: 100%

47 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-14 04:38 +0000

1# vim: set filetype=python fileencoding=utf-8: 

2# -*- coding: utf-8 -*- 

3 

4#============================================================================# 

5# # 

6# Licensed under the Apache License, Version 2.0 (the "License"); # 

7# you may not use this file except in compliance with the License. # 

8# You may obtain a copy of the License at # 

9# # 

10# http://www.apache.org/licenses/LICENSE-2.0 # 

11# # 

12# Unless required by applicable law or agreed to in writing, software # 

13# distributed under the License is distributed on an "AS IS" BASIS, # 

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 

15# See the License for the specific language governing permissions and # 

16# limitations under the License. # 

17# # 

18#============================================================================# 

19 

20 

21''' Core types and behaviors. ''' 

22 

23 

24from . import __ 

25from . import exceptions as _exceptions 

26from . import nomina as _nomina 

27 

28 

# Charset name published as this module's default. Its consumers are not
# visible in this part of the package.
CHARSET_DEFAULT = 'utf-8'
# MIME type published as this module's default. 'application/octet-stream'
# is the generic type for arbitrary binary data.
MIMETYPE_DEFAULT = 'application/octet-stream'

31 

32 

class BehaviorTristate( __.enum.Enum ):
    ''' Three-way switch for an optional behavior.

        Used by ``Behaviors`` for its ``text_validate`` and ``trial_decode``
        fields.
    '''

    # Values are assigned by ``auto`` in declaration order; keep the order
    # stable so that the underlying values do not shift.
    Never = __.enum.auto( )
    AsNeeded = __.enum.auto( )
    Always = __.enum.auto( )

39 

40 

class CodecSpecifiers( __.enum.Enum ):
    ''' Placeholders for codecs which are resolved dynamically.

        Members may appear alongside literal codec names in
        ``Behaviors.trial_codecs``. How each member is resolved to a concrete
        codec is decided outside this module.
    '''

    # Values are assigned by ``auto`` in declaration order; keep the order
    # stable so that the underlying values do not shift.
    FromInference = __.enum.auto( )
    OsDefault = __.enum.auto( )
    PythonDefault = __.enum.auto( )
    UserSupplement = __.enum.auto( )

48 

49 

class DetectFailureActions( __.enum.Enum ):
    ''' Choices for what to do when detection fails.

        Used by ``Behaviors`` for its ``charset_on_detect_failure`` and
        ``mimetype_on_detect_failure`` fields.
    '''

    # NOTE(review): by their names, ``Default`` presumably substitutes a
    # default value and ``Error`` presumably raises; confirm at call sites.
    Default = __.enum.auto( )
    Error = __.enum.auto( )

55 

56 

class Behaviors( __.immut.DataclassObject ):
    ''' How functions behave.

        Every field has a default, so ``Behaviors( )`` is a complete
        configuration.

        Construction raises ``BehaviorsInvalidity`` if ``charset_detect`` or
        ``mimetype_detect`` is not a ``bool``, or if
        ``bytes_quantity_confidence_divisor`` is not a positive number.
    '''

    bytes_quantity_confidence_divisor: __.typx.Annotated[
        int,
        __.ddoc.Doc(
            ''' Minimum number of bytes for full detection confidence. ''' ),
    ] = 1024
    charset_detect: __.typx.Annotated[
        bool,
        __.ddoc.Doc( ''' Whether to detect charset from content. ''' ),
    ] = True
    charset_detectors_order: __.typx.Annotated[
        __.cabc.Sequence[ str ],
        __.ddoc.Doc(
            ''' Order in which charset detectors should be applied. ''' ),
    ] = ( 'chardet', 'charset-normalizer' )
    charset_on_detect_failure: __.typx.Annotated[
        DetectFailureActions,
        __.ddoc.Doc( ''' Action to take on charset detection failure. ''' ),
    ] = DetectFailureActions.Default
    mimetype_detect: __.typx.Annotated[
        bool,
        __.ddoc.Doc( ''' Whether to detect MIME type from content. ''' ),
    ] = True
    mimetype_detectors_order: __.typx.Annotated[
        __.cabc.Sequence[ str ],
        __.ddoc.Doc(
            ''' Order in which MIME type detectors should be applied. ''' ),
    ] = ( 'magic', 'puremagic' )
    mimetype_on_detect_failure: __.typx.Annotated[
        DetectFailureActions,
        __.ddoc.Doc( ''' Action to take on MIME type detection failure. ''' ),
    ] = DetectFailureActions.Default
    on_decode_error: __.typx.Annotated[
        str,
        __.ddoc.Doc(
            ''' Response to charset decoding errors.

                Standard values are 'ignore', 'replace', and 'strict'.
                Can also be any other name which has been registered via
                the 'register_error' function in the Python standard library
                'codecs' module.
            ''' ),
    ] = 'strict'
    remove_bom: __.typx.Annotated[
        bool, __.ddoc.Doc( ''' Remove byte-ordering mark? ''' )
    ] = True
    text_validate: __.typx.Annotated[
        BehaviorTristate,
        __.ddoc.Doc( ''' When to validate text. ''' ),
    ] = BehaviorTristate.AsNeeded
    text_validate_confidence: __.typx.Annotated[
        float,
        __.ddoc.Doc( ''' Minimum confidence to skip text validation. ''' ),
    ] = 0.80
    trial_codecs: __.typx.Annotated[
        __.cabc.Sequence[ str | CodecSpecifiers ],
        __.ddoc.Doc( ''' Sequence of codec names or specifiers. ''' ),
    ] = (
        CodecSpecifiers.UserSupplement,
        'utf-8',
        CodecSpecifiers.FromInference,
        CodecSpecifiers.OsDefault,
        CodecSpecifiers.PythonDefault,
    )
    trial_decode: __.typx.Annotated[
        BehaviorTristate,
        __.ddoc.Doc(
            ''' When to perform trial decode of content with charset. ''' ),
    ] = BehaviorTristate.AsNeeded
    trial_decode_confidence: __.typx.Annotated[
        float, __.ddoc.Doc( ''' Minimum confidence to skip trial decode. ''' )
    ] = 0.80

    def __post_init__( self ) -> None:
        ''' Rejects field values which cannot be honored later.

            Raises ``BehaviorsInvalidity`` naming the first offending field.
        '''
        # Only genuine booleans are accepted for the detection toggles;
        # other truthy or falsy objects are rejected rather than coerced.
        for name in ( 'charset_detect', 'mimetype_detect' ):
            if not isinstance( getattr( self, name ), bool ):
                raise _exceptions.BehaviorsInvalidity( name, 'a boolean' )
        # The divisor is used as a denominator when scaling confidence by
        # content length: zero would raise ZeroDivisionError there and a
        # negative value would yield a confidence below 0.0. The negated
        # comparison also rejects NaN.
        divisor = self.bytes_quantity_confidence_divisor
        if not isinstance( divisor, ( int, float ) ) or not divisor > 0:
            raise _exceptions.BehaviorsInvalidity(
                'bytes_quantity_confidence_divisor', 'a positive number' )

139 

140 

# Annotated alias for parameters which accept a ``Behaviors`` instance; the
# attached ``Doc`` carries the parameter description.
BehaviorsArgument: __.typx.TypeAlias = __.typx.Annotated[
    Behaviors,
    __.ddoc.Doc(
        ''' Configuration for detection and inference behaviors. ''' ),
]

146 

147 

# Instance built purely from field defaults. It is created once at import
# time and shared as the default ``behaviors`` argument of
# ``confidence_from_bytes_quantity``.
BEHAVIORS_DEFAULT = Behaviors( )

149 

150 

class CharsetResult( __.immut.DataclassObject ):
    ''' Pairs a charset name, which may be absent, with a confidence. '''

    # Field order is part of the interface: ``charset`` first, then
    # ``confidence``. Neither field has a default.
    charset: __.typx.Annotated[
        __.typx.Optional[ str ],
        __.ddoc.Doc(
            ''' Detected character set encoding. May be ``None``.''' ),
    ]
    confidence: __.typx.Annotated[
        float,
        __.ddoc.Doc( ''' Detection confidence from 0.0 to 1.0. ''' ),
    ]

162 

163 

class MimetypeResult( __.immut.DataclassObject ):
    ''' Pairs a MIME type with a confidence. '''

    # Field order is part of the interface: ``mimetype`` first, then
    # ``confidence``. Neither field has a default.
    mimetype: __.typx.Annotated[
        str,
        __.ddoc.Doc( ''' Detected MIME type. ''' ),
    ]
    confidence: __.typx.Annotated[
        float,
        __.ddoc.Doc( ''' Detection confidence from 0.0 to 1.0. ''' ),
    ]

173 

174 

def confidence_from_bytes_quantity(
    content: _nomina.Content, behaviors: Behaviors = BEHAVIORS_DEFAULT
) -> float:
    ''' Scales confidence by content length, capped at 1.0.

        Returns the length of ``content`` divided by
        ``behaviors.bytes_quantity_confidence_divisor``, or 1.0 if that
        ratio would exceed 1.0.
    '''
    threshold = behaviors.bytes_quantity_confidence_divisor
    ratio = len( content ) / threshold
    return min( 1.0, ratio )