Coverage for sources/detextive/core.py: 100%

42 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-20 18:02 +0000

1# vim: set filetype=python fileencoding=utf-8: 

2# -*- coding: utf-8 -*- 

3 

4#============================================================================# 

5# # 

6# Licensed under the Apache License, Version 2.0 (the "License"); # 

7# you may not use this file except in compliance with the License. # 

8# You may obtain a copy of the License at # 

9# # 

10# http://www.apache.org/licenses/LICENSE-2.0 # 

11# # 

12# Unless required by applicable law or agreed to in writing, software # 

13# distributed under the License is distributed on an "AS IS" BASIS, # 

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 

15# See the License for the specific language governing permissions and # 

16# limitations under the License. # 

17# # 

18#============================================================================# 

19 

20 

21''' Core types and behaviors. ''' 

22 

23 

24from . import __ 

25from . import nomina as _nomina 

26 

27 

# Pairs of ( detected charset, charset to use instead ).
# Both ASCII and UTF-8 are mapped to 'utf-8-sig'.
_STANDARD_CHARSET_PROMOTIONS = tuple(
    ( charset, 'utf-8-sig' ) for charset in ( 'ascii', 'utf-8' ) )


# Fallback values for charset and MIME type.
MIMETYPE_DEFAULT = 'application/octet-stream'
CHARSET_DEFAULT = 'utf-8'

36 

37 

class BehaviorTristate( __.enum.Enum ):
    ''' Three-way switch for when a behavior is applied. '''

    # Values come from 'auto', so declaration order determines them.
    Never = __.enum.auto( )     # behavior is not applied
    AsNeeded = __.enum.auto( )  # behavior is applied only when needed
    Always = __.enum.auto( )    # behavior is always applied

44 

45 

class CodecSpecifiers( __.enum.Enum ):
    ''' Symbolic specifiers for codecs which are determined dynamically.

        Members may be mixed with literal codec names, as in the
        ``trial_codecs`` field of ``Behaviors``.
    '''

    # NOTE(review): Member meanings below are inferred from their names;
    #               confirm against the code which resolves them.
    FromInference = __.enum.auto( )     # presumably the inferred charset
    OsDefault = __.enum.auto( )         # presumably the OS default encoding
    PythonDefault = __.enum.auto( )     # presumably Python default encoding
    UserSupplement = __.enum.auto( )    # presumably a user-supplied codec

53 

54 

class DetectFailureActions( __.enum.Enum ):
    ''' Choices for how to respond when detection fails. '''

    # NOTE(review): Member meanings are inferred from their names; confirm
    #               against the detection functions which consume them.
    Default = __.enum.auto( )   # presumably fall back to a default value
    Error = __.enum.auto( )     # presumably raise an error

60 

61 

class Behaviors( __.immut.DataclassObject ):
    ''' Settings which configure how functions behave.

        Every field has a default value, so ``Behaviors( )`` produces the
        standard configuration.
    '''

    # --- Confidence scaling ---

    bytes_quantity_confidence_divisor: __.typx.Annotated[
        int,
        __.ddoc.Doc(
            ''' Minimum number of bytes for full detection confidence. ''' ),
    ] = 1024

    # --- Charset detection ---

    charset_detect: __.typx.Annotated[
        BehaviorTristate,
        __.ddoc.Doc( ''' When to detect charset from content. ''' ),
    ] = BehaviorTristate.AsNeeded

    charset_detectors_order: __.typx.Annotated[
        __.cabc.Sequence[ str ],
        __.ddoc.Doc(
            ''' Order in which charset detectors should be applied. ''' ),
    ] = ( 'chardet', 'charset-normalizer' )

    charset_on_detect_failure: __.typx.Annotated[
        DetectFailureActions,
        __.ddoc.Doc( ''' Action to take on charset detection failure. ''' ),
    ] = DetectFailureActions.Default

    # A factory is used so that each instance receives its own immutable
    # dictionary built from the standard promotions.
    charset_promotions: __.typx.Annotated[
        __.cabc.Mapping[ str, str ],
        __.ddoc.Doc(
            ''' Which detected charsets to promote to other charsets.

                E.g., 7-bit ASCII to UTF-8.
            ''' ),
    ] = __.dcls.field(
        default_factory = lambda: __.immut.Dictionary(
            _STANDARD_CHARSET_PROMOTIONS ) )

    # --- MIME type detection ---

    mimetype_detect: __.typx.Annotated[
        BehaviorTristate,
        __.ddoc.Doc( ''' When to detect MIME type from content. ''' ),
    ] = BehaviorTristate.AsNeeded

    mimetype_detectors_order: __.typx.Annotated[
        __.cabc.Sequence[ str ],
        __.ddoc.Doc(
            ''' Order in which MIME type detectors should be applied. ''' ),
    ] = ( 'magic', 'puremagic' )

    mimetype_on_detect_failure: __.typx.Annotated[
        DetectFailureActions,
        __.ddoc.Doc( ''' Action to take on MIME type detection failure. ''' ),
    ] = DetectFailureActions.Default

    # --- Decoding ---

    on_decode_error: __.typx.Annotated[
        str,
        __.ddoc.Doc(
            ''' Response to charset decoding errors.

                Standard values are 'ignore', 'replace', and 'strict'.
                Can also be any other name which has been registered via
                the 'register_error' function in the Python standard library
                'codecs' module.
            ''' ),
    ] = 'strict'

    # --- Text validation ---

    text_validate: __.typx.Annotated[
        BehaviorTristate,
        __.ddoc.Doc( ''' When to validate text. ''' ),
    ] = BehaviorTristate.AsNeeded

    text_validate_confidence: __.typx.Annotated[
        float,
        __.ddoc.Doc( ''' Minimum confidence to skip text validation. ''' ),
    ] = 0.80

    # --- Trial decoding ---

    trial_codecs: __.typx.Annotated[
        __.cabc.Sequence[ str | CodecSpecifiers ],
        __.ddoc.Doc( ''' Sequence of codec names or specifiers. ''' ),
    ] = ( CodecSpecifiers.FromInference, CodecSpecifiers.UserSupplement )

    trial_decode: __.typx.Annotated[
        BehaviorTristate,
        __.ddoc.Doc(
            ''' When to perform trial decode of content with charset. ''' ),
    ] = BehaviorTristate.AsNeeded

    trial_decode_confidence: __.typx.Annotated[
        float,
        __.ddoc.Doc( ''' Minimum confidence to skip trial decode. ''' ),
    ] = 0.80

137 

138 

# Annotated alias for parameters which accept a behaviors configuration.
BehaviorsArgument: __.typx.TypeAlias = (
    __.typx.Annotated[
        Behaviors,
        __.ddoc.Doc(
            ''' Configuration for detection and inference behaviors. ''' ),
    ] )


# Shared instance with every field at its default value.
BEHAVIORS_DEFAULT = Behaviors( )

147 

148 

class CharsetResult( __.immut.DataclassObject ):
    ''' Pairing of a charset with the confidence of its detection. '''

    # The annotation permits ``None`` in place of a charset name.
    charset: __.typx.Annotated[
        __.typx.Optional[ str ],
        __.ddoc.Doc(
            ''' Detected character set encoding. May be ``None``.''' ),
    ]

    confidence: __.typx.Annotated[
        float,
        __.ddoc.Doc( ''' Detection confidence from 0.0 to 1.0. ''' ),
    ]

160 

161 

class MimetypeResult( __.immut.DataclassObject ):
    ''' Pairing of a MIME type with the confidence of its detection. '''

    mimetype: __.typx.Annotated[
        str,
        __.ddoc.Doc( ''' Detected MIME type. ''' ),
    ]

    confidence: __.typx.Annotated[
        float,
        __.ddoc.Doc( ''' Detection confidence from 0.0 to 1.0. ''' ),
    ]

171 

172 

def confidence_from_bytes_quantity(
    content: _nomina.Content, behaviors: Behaviors = BEHAVIORS_DEFAULT
) -> float:
    ''' Calculates confidence from quantity of content bytes.

        Confidence is the length of the content divided by
        ``behaviors.bytes_quantity_confidence_divisor``, capped at 1.0.

        A non-positive divisor is treated as a requirement of zero bytes
        for full confidence, so 1.0 is returned rather than raising
        ``ZeroDivisionError`` or producing a negative confidence.
    '''
    divisor = behaviors.bytes_quantity_confidence_divisor
    # Guard against misconfiguration: division by zero would raise and
    # a negative divisor would yield a confidence below 0.0.
    if divisor <= 0: return 1.0
    return min( 1.0, len( content ) / divisor )