Coverage for sources/detextive/core.py: 100%

1# vim: set filetype=python fileencoding=utf-8:

2# -*- coding: utf-8 -*-

4#============================================================================#

5# #

6# Licensed under the Apache License, Version 2.0 (the "License"); #

7# you may not use this file except in compliance with the License. #

8# You may obtain a copy of the License at #

9# #

10# http://www.apache.org/licenses/LICENSE-2.0 #

11# #

12# Unless required by applicable law or agreed to in writing, software #

13# distributed under the License is distributed on an "AS IS" BASIS, #

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #

15# See the License for the specific language governing permissions and #

16# limitations under the License. #

17# #

18#============================================================================#

21''' Core types and behaviors. '''

24from . import __

25from . import exceptions as _exceptions

26from . import nomina as _nomina

# Module-wide default names for charset and MIME type.
# NOTE(review): consumers are outside this view; presumably these are the
# fallbacks used when detection produces no result -- confirm.
CHARSET_DEFAULT = 'utf-8'
MIMETYPE_DEFAULT = 'application/octet-stream'

class BehaviorTristate( __.enum.Enum ):
    ''' Three-way switch for when a behavior applies.

        Used by ``Behaviors`` for the ``text_validate`` and ``trial_decode``
        settings.
    '''

    # Member values are assigned automatically; only identity is meaningful.
    Never       = __.enum.auto( )
    AsNeeded    = __.enum.auto( )
    Always      = __.enum.auto( )

class CodecSpecifiers( __.enum.Enum ):
    ''' Symbolic stand-ins for codecs which are resolved dynamically.

        Members may appear alongside literal codec names in
        ``Behaviors.trial_codecs``.
    '''

    # NOTE(review): resolution of each member happens outside this module;
    # meanings below are implied by the names only -- confirm with consumers.
    FromInference   = __.enum.auto( )
    OsDefault       = __.enum.auto( )
    PythonDefault   = __.enum.auto( )
    UserSupplement  = __.enum.auto( )

class DetectFailureActions( __.enum.Enum ):
    ''' Choices for how to respond when detection fails.

        Used by ``Behaviors`` for the ``charset_on_detect_failure`` and
        ``mimetype_on_detect_failure`` settings.
    '''

    # NOTE(review): presumably 'Default' falls back to a default value and
    # 'Error' raises -- handling lives outside this module; confirm.
    Default = __.enum.auto( )
    Error   = __.enum.auto( )

class Behaviors( __.immut.DataclassObject ):
    ''' Settings which govern detection, validation, and decoding.

        Every field has a default, so ``Behaviors( )`` is a complete
        configuration. The two ``*_detect`` toggles are type-checked after
        initialization.
    '''

    # -- Confidence scaling --------------------------------------------------

    bytes_quantity_confidence_divisor: __.typx.Annotated[
        int,
        __.ddoc.Doc(
            ''' Minimum number of bytes for full detection confidence. ''' ),
    ] = 1024

    # -- Charset detection ---------------------------------------------------

    charset_detect: __.typx.Annotated[
        bool, __.ddoc.Doc( ''' Whether to detect charset from content. ''' )
    ] = True
    charset_detectors_order: __.typx.Annotated[
        __.cabc.Sequence[ str ],
        __.ddoc.Doc(
            ''' Order in which charset detectors should be applied. ''' ),
    ] = ( 'chardet', 'charset-normalizer' )
    charset_on_detect_failure: __.typx.Annotated[
        DetectFailureActions,
        __.ddoc.Doc( ''' Action to take on charset detection failure. ''' ),
    ] = DetectFailureActions.Default

    # -- MIME type detection -------------------------------------------------

    mimetype_detect: __.typx.Annotated[
        bool, __.ddoc.Doc( ''' Whether to detect MIME type from content. ''' )
    ] = True
    mimetype_detectors_order: __.typx.Annotated[
        __.cabc.Sequence[ str ],
        __.ddoc.Doc(
            ''' Order in which MIME type detectors should be applied. ''' ),
    ] = ( 'magic', 'puremagic' )
    mimetype_on_detect_failure: __.typx.Annotated[
        DetectFailureActions,
        __.ddoc.Doc( ''' Action to take on MIME type detection failure. ''' ),
    ] = DetectFailureActions.Default

    # -- Decoding and validation ---------------------------------------------

    on_decode_error: __.typx.Annotated[
        str,
        __.ddoc.Doc(
            ''' Response to charset decoding errors.

                Standard values are 'ignore', 'replace', and 'strict'.
                Can also be any other name which has been registered via
                the 'register_error' function in the Python standard library
                'codecs' module.
            ''' ),
    ] = 'strict'
    remove_bom: __.typx.Annotated[
        bool, __.ddoc.Doc( ''' Remove byte-ordering mark? ''' )
    ] = True
    text_validate: __.typx.Annotated[
        BehaviorTristate, __.ddoc.Doc( ''' When to validate text. ''' )
    ] = BehaviorTristate.AsNeeded
    text_validate_confidence: __.typx.Annotated[
        float,
        __.ddoc.Doc( ''' Minimum confidence to skip text validation. ''' ),
    ] = 0.80
    trial_codecs: __.typx.Annotated[
        __.cabc.Sequence[ str | CodecSpecifiers ],
        __.ddoc.Doc( ''' Sequence of codec names or specifiers. ''' ),
    ] = (
        CodecSpecifiers.UserSupplement,
        'utf-8',
        CodecSpecifiers.FromInference,
        CodecSpecifiers.OsDefault,
        CodecSpecifiers.PythonDefault,
    )
    trial_decode: __.typx.Annotated[
        BehaviorTristate,
        __.ddoc.Doc(
            ''' When to perform trial decode of content with charset. ''' ),
    ] = BehaviorTristate.AsNeeded
    trial_decode_confidence: __.typx.Annotated[
        float, __.ddoc.Doc( ''' Minimum confidence to skip trial decode. ''')
    ] = 0.80

    def __post_init__( self ) -> None:
        ''' Rejects non-boolean values for the detection toggles.

            Checks ``charset_detect`` and then ``mimetype_detect``; raises
            ``BehaviorsInvalidity`` for the first which is not a ``bool``.
        '''
        for name in ( 'charset_detect', 'mimetype_detect' ):
            if isinstance( getattr( self, name ), bool ): continue
            raise _exceptions.BehaviorsInvalidity( name, 'a boolean' )

139

140

# Annotated alias for parameters which accept a ``Behaviors`` instance.
BehaviorsArgument: __.typx.TypeAlias = __.typx.Annotated[
    Behaviors,
    __.ddoc.Doc(
        ''' Configuration for detection and inference behaviors. ''' ),
]


# Shared instance with every field at its declared default.
BEHAVIORS_DEFAULT = Behaviors( )

149

150

class CharsetResult( __.immut.DataclassObject ):
    ''' Pairing of a charset name with the confidence of its detection.

        Both fields are required. ``charset`` may be ``None``.
    '''

    charset: __.typx.Annotated[
        __.typx.Optional[ str ],
        __.ddoc.Doc(
            ''' Detected character set encoding. May be ``None``.''' ),
    ]
    confidence: __.typx.Annotated[
        float,
        __.ddoc.Doc( ''' Detection confidence from 0.0 to 1.0. ''' ),
    ]

162

163

class MimetypeResult( __.immut.DataclassObject ):
    ''' Pairing of a MIME type with the confidence of its detection.

        Both fields are required.
    '''

    mimetype: __.typx.Annotated[
        str,
        __.ddoc.Doc( ''' Detected MIME type. ''' ),
    ]
    confidence: __.typx.Annotated[
        float,
        __.ddoc.Doc( ''' Detection confidence from 0.0 to 1.0. ''' ),
    ]

173

174

def confidence_from_bytes_quantity(
    content: _nomina.Content, behaviors: Behaviors = BEHAVIORS_DEFAULT
) -> float:
    ''' Derives a confidence factor from the quantity of content.

        Returns ``len( content )`` divided by
        ``behaviors.bytes_quantity_confidence_divisor``, capped at 1.0.
        Empty content therefore yields 0.0 for any positive divisor.

        A non-positive divisor yields 1.0 rather than a division error or a
        negative confidence.
    '''
    divisor = behaviors.bytes_quantity_confidence_divisor
    # The divisor is documented as the minimum byte count for full
    # confidence; a threshold of zero or less is met by any content.
    if divisor <= 0: return 1.0
    return min( 1.0, len( content ) / divisor )

Coverage for sources/detextive/core.py: 100%

47 statements