Coverage for sources / detextive / core.py: 100%
47 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-14 04:38 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-14 04:38 +0000
1# vim: set filetype=python fileencoding=utf-8:
2# -*- coding: utf-8 -*-
4#============================================================================#
5# #
6# Licensed under the Apache License, Version 2.0 (the "License"); #
7# you may not use this file except in compliance with the License. #
8# You may obtain a copy of the License at #
9# #
10# http://www.apache.org/licenses/LICENSE-2.0 #
11# #
12# Unless required by applicable law or agreed to in writing, software #
13# distributed under the License is distributed on an "AS IS" BASIS, #
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
15# See the License for the specific language governing permissions and #
16# limitations under the License. #
17# #
18#============================================================================#
21''' Core types and behaviors. '''
24from . import __
25from . import exceptions as _exceptions
26from . import nomina as _nomina
# Package-wide fallback values.
# NOTE(review): consumers of these constants are outside this module;
# presumably they are used when detection yields no result -- confirm.
CHARSET_DEFAULT = 'utf-8'
MIMETYPE_DEFAULT = 'application/octet-stream'
class BehaviorTristate( __.enum.Enum ):
    ''' Three-way switch for when a behavior applies. '''

    # Values are assigned automatically in declaration order.
    Never       = __.enum.auto( )
    AsNeeded    = __.enum.auto( )
    Always      = __.enum.auto( )
class CodecSpecifiers( __.enum.Enum ):
    ''' Placeholders for codecs which are determined dynamically.

        Members may appear alongside literal codec names, as in the
        default for ``Behaviors.trial_codecs``.
    '''

    # Values are assigned automatically in declaration order.
    FromInference   = __.enum.auto( )
    OsDefault       = __.enum.auto( )
    PythonDefault   = __.enum.auto( )
    UserSupplement  = __.enum.auto( )
class DetectFailureActions( __.enum.Enum ):
    ''' Available reactions when detection fails. '''

    # NOTE(review): presumably 'Default' falls back to a default value and
    # 'Error' raises; the handling code is not in this module -- confirm.
    Default     = __.enum.auto( )
    Error       = __.enum.auto( )
class Behaviors( __.immut.DataclassObject ):
    ''' Configuration of detection, validation, and decoding behaviors.

        Each field carries its own description via ``Doc`` annotation.
        Only the ``charset_detect`` and ``mimetype_detect`` toggles are
        type-checked upon construction.
    '''

    # -- Confidence scaling --
    bytes_quantity_confidence_divisor: __.typx.Annotated[
        int,
        __.ddoc.Doc(
            ''' Minimum number of bytes for full detection confidence. ''' ),
    ] = 1024

    # -- Charset detection --
    charset_detect: __.typx.Annotated[
        bool,
        __.ddoc.Doc( ''' Whether to detect charset from content. ''' ),
    ] = True
    charset_detectors_order: __.typx.Annotated[
        __.cabc.Sequence[ str ],
        __.ddoc.Doc(
            ''' Order in which charset detectors should be applied. ''' ),
    ] = ( 'chardet', 'charset-normalizer' )
    charset_on_detect_failure: __.typx.Annotated[
        DetectFailureActions,
        __.ddoc.Doc( ''' Action to take on charset detection failure. ''' ),
    ] = DetectFailureActions.Default

    # -- MIME type detection --
    mimetype_detect: __.typx.Annotated[
        bool,
        __.ddoc.Doc( ''' Whether to detect MIME type from content. ''' ),
    ] = True
    mimetype_detectors_order: __.typx.Annotated[
        __.cabc.Sequence[ str ],
        __.ddoc.Doc(
            ''' Order in which MIME type detectors should be applied. ''' ),
    ] = ( 'magic', 'puremagic' )
    mimetype_on_detect_failure: __.typx.Annotated[
        DetectFailureActions,
        __.ddoc.Doc( ''' Action to take on MIME type detection failure. ''' ),
    ] = DetectFailureActions.Default

    # -- Decoding --
    on_decode_error: __.typx.Annotated[
        str,
        __.ddoc.Doc(
            ''' Response to charset decoding errors.

                Standard values are 'ignore', 'replace', and 'strict'.
                Can also be any other name which has been registered via
                the 'register_error' function in the Python standard library
                'codecs' module.
            ''' ),
    ] = 'strict'
    remove_bom: __.typx.Annotated[
        bool,
        __.ddoc.Doc( ''' Remove byte-ordering mark? ''' ),
    ] = True

    # -- Text validation --
    text_validate: __.typx.Annotated[
        BehaviorTristate,
        __.ddoc.Doc( ''' When to validate text. ''' ),
    ] = BehaviorTristate.AsNeeded
    text_validate_confidence: __.typx.Annotated[
        float,
        __.ddoc.Doc( ''' Minimum confidence to skip text validation. ''' ),
    ] = 0.80

    # -- Trial decoding --
    trial_codecs: __.typx.Annotated[
        __.cabc.Sequence[ str | CodecSpecifiers ],
        __.ddoc.Doc( ''' Sequence of codec names or specifiers. ''' ),
    ] = (
        CodecSpecifiers.UserSupplement,
        'utf-8',
        CodecSpecifiers.FromInference,
        CodecSpecifiers.OsDefault,
        CodecSpecifiers.PythonDefault,
    )
    trial_decode: __.typx.Annotated[
        BehaviorTristate,
        __.ddoc.Doc(
            ''' When to perform trial decode of content with charset. ''' ),
    ] = BehaviorTristate.AsNeeded
    trial_decode_confidence: __.typx.Annotated[
        float,
        __.ddoc.Doc( ''' Minimum confidence to skip trial decode. ''' ),
    ] = 0.80

    def __post_init__( self ) -> None:
        ''' Rejects non-boolean values for the detection toggles.

            Raises ``BehaviorsInvalidity`` for the first offending field.
        '''
        # Checked in this order so that 'charset_detect' is reported first.
        for name in ( 'charset_detect', 'mimetype_detect' ):
            if isinstance( getattr( self, name ), bool ): continue
            raise _exceptions.BehaviorsInvalidity( name, 'a boolean' )
# Annotated alias for use in signatures which accept a behaviors argument.
BehaviorsArgument: __.typx.TypeAlias = __.typx.Annotated[
    Behaviors,
    __.ddoc.Doc(
        ''' Configuration for detection and inference behaviors. ''' ),
]


# Shared instance with every field at its declared default.
BEHAVIORS_DEFAULT = Behaviors( )
class CharsetResult( __.immut.DataclassObject ):
    ''' Pairing of a charset, possibly absent, with its confidence. '''

    charset: __.typx.Annotated[
        __.typx.Optional[ str ],
        __.ddoc.Doc(
            ''' Detected character set encoding. May be ``None``.''' ),
    ]
    confidence: __.typx.Annotated[
        float,
        __.ddoc.Doc( ''' Detection confidence from 0.0 to 1.0. ''' ),
    ]
class MimetypeResult( __.immut.DataclassObject ):
    ''' Pairing of a MIME type with its confidence. '''

    mimetype: __.typx.Annotated[
        str,
        __.ddoc.Doc( ''' Detected MIME type. ''' ),
    ]
    confidence: __.typx.Annotated[
        float,
        __.ddoc.Doc( ''' Detection confidence from 0.0 to 1.0. ''' ),
    ]
def confidence_from_bytes_quantity(
    content: _nomina.Content, behaviors: Behaviors = BEHAVIORS_DEFAULT
) -> float:
    ''' Scales confidence by quantity of content.

        Returns the ratio of the content length to
        ``behaviors.bytes_quantity_confidence_divisor``, capped at 1.0.

        A non-positive divisor is treated as "no minimum quantity
        required" and produces full confidence (1.0), rather than raising
        ``ZeroDivisionError`` or producing a negative confidence.
    '''
    divisor = behaviors.bytes_quantity_confidence_divisor
    # Guard: zero would raise on division; negatives would invert the sign.
    if divisor <= 0: return 1.0
    return min( 1.0, len( content ) / divisor )