Coverage for sources/detextive/core.py: 100%
42 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-20 18:02 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-20 18:02 +0000
1# vim: set filetype=python fileencoding=utf-8:
2# -*- coding: utf-8 -*-
4#============================================================================#
5# #
6# Licensed under the Apache License, Version 2.0 (the "License"); #
7# you may not use this file except in compliance with the License. #
8# You may obtain a copy of the License at #
9# #
10# http://www.apache.org/licenses/LICENSE-2.0 #
11# #
12# Unless required by applicable law or agreed to in writing, software #
13# distributed under the License is distributed on an "AS IS" BASIS, #
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
15# See the License for the specific language governing permissions and #
16# limitations under the License. #
17# #
18#============================================================================#
21''' Core types and behaviors. '''
24from . import __
25from . import nomina as _nomina
# Pairs of ( detected charset, charset to use instead ).
# Both source charsets share the same promotion target, 'utf-8-sig'.
_STANDARD_CHARSET_PROMOTIONS = tuple(
    ( charset, 'utf-8-sig' ) for charset in ( 'ascii', 'utf-8' ) )


# Module-level fallback names for charset and MIME type.
# NOTE(review): consumers of these fallbacks are outside this module.
CHARSET_DEFAULT = 'utf-8'
MIMETYPE_DEFAULT = 'application/octet-stream'
class BehaviorTristate( __.enum.Enum ):
    ''' Three-way switch for whether an optional behavior is applied.

        Serves as type of several ``Behaviors`` fields, such as
        ``charset_detect``, ``mimetype_detect``, ``text_validate``, and
        ``trial_decode``.
    '''

    # Member values are generated by ``auto`` in declaration order.
    Never = __.enum.auto( )     # Behavior is not applied.
    # NOTE(review): criteria for "needed" are decided by consumers of this
    #               enumeration and are not defined in this module.
    AsNeeded = __.enum.auto( )
    Always = __.enum.auto( )    # Behavior is applied unconditionally.
class CodecSpecifiers( __.enum.Enum ):
    ''' Symbolic placeholders for codecs which are resolved dynamically.

        May appear alongside literal codec names in
        ``Behaviors.trial_codecs``.
    '''

    # NOTE(review): resolution of each specifier to an actual codec happens
    #               outside this module; the remarks below are read from
    #               the member names only - confirm against the resolver.
    FromInference = __.enum.auto( )     # presumably an inferred charset
    OsDefault = __.enum.auto( )         # presumably the OS default encoding
    PythonDefault = __.enum.auto( )     # presumably Python default encoding
    UserSupplement = __.enum.auto( )    # presumably a caller-supplied codec
class DetectFailureActions( __.enum.Enum ):
    ''' Choices for how a failed detection is handled.

        Serves as type of ``Behaviors.charset_on_detect_failure`` and
        ``Behaviors.mimetype_on_detect_failure``.
    '''

    # NOTE(review): presumably ``Default`` substitutes a default value and
    #               ``Error`` raises an exception; handling code is not in
    #               this module - confirm against consumers.
    Default = __.enum.auto( )
    Error = __.enum.auto( )
class Behaviors( __.immut.DataclassObject ):
    ''' Tunable settings which govern detection, validation, and decoding.

        Every field has a default. Meaning of each field is carried by
        the ``Doc`` annotation on that field. Fields are declared in
        alphabetical order.
    '''

    # --- Confidence scaling by content size ---

    bytes_quantity_confidence_divisor: __.typx.Annotated[
        int,
        __.ddoc.Doc(
            ''' Minimum number of bytes for full detection confidence. ''' ),
    ] = 1024

    # --- Charset detection ---

    charset_detect: __.typx.Annotated[
        BehaviorTristate,
        __.ddoc.Doc( ''' When to detect charset from content. ''' ),
    ] = BehaviorTristate.AsNeeded
    charset_detectors_order: __.typx.Annotated[
        __.cabc.Sequence[ str ],
        __.ddoc.Doc(
            ''' Order in which charset detectors should be applied. ''' ),
    ] = ( 'chardet', 'charset-normalizer' )
    charset_on_detect_failure: __.typx.Annotated[
        DetectFailureActions,
        __.ddoc.Doc( ''' Action to take on charset detection failure. ''' ),
    ] = DetectFailureActions.Default
    # Factory builds a new ``Dictionary`` from the module-level promotion
    # pairs each time a default is needed.
    charset_promotions: __.typx.Annotated[
        __.cabc.Mapping[ str, str ],
        __.ddoc.Doc(
            ''' Which detected charsets to promote to other charsets.

                E.g., 7-bit ASCII to UTF-8.
            ''' ),
    ] = __.dcls.field(
        default_factory = (
            lambda: __.immut.Dictionary( _STANDARD_CHARSET_PROMOTIONS ) ) )

    # --- MIME type detection ---

    mimetype_detect: __.typx.Annotated[
        BehaviorTristate,
        __.ddoc.Doc( ''' When to detect MIME type from content. ''' ),
    ] = BehaviorTristate.AsNeeded
    mimetype_detectors_order: __.typx.Annotated[
        __.cabc.Sequence[ str ],
        __.ddoc.Doc(
            ''' Order in which MIME type detectors should be applied. ''' ),
    ] = ( 'magic', 'puremagic' )
    mimetype_on_detect_failure: __.typx.Annotated[
        DetectFailureActions,
        __.ddoc.Doc( ''' Action to take on MIME type detection failure. ''' ),
    ] = DetectFailureActions.Default

    # --- Decode error handling ---

    on_decode_error: __.typx.Annotated[
        str,
        __.ddoc.Doc(
            ''' Response to charset decoding errors.

                Standard values are 'ignore', 'replace', and 'strict'.
                Can also be any other name which has been registered via
                the 'register_error' function in the Python standard library
                'codecs' module.
            ''' ),
    ] = 'strict'

    # --- Text validation ---

    text_validate: __.typx.Annotated[
        BehaviorTristate,
        __.ddoc.Doc( ''' When to validate text. ''' ),
    ] = BehaviorTristate.AsNeeded
    text_validate_confidence: __.typx.Annotated[
        float,
        __.ddoc.Doc( ''' Minimum confidence to skip text validation. ''' ),
    ] = 0.80

    # --- Trial decoding ---

    trial_codecs: __.typx.Annotated[
        __.cabc.Sequence[ str | CodecSpecifiers ],
        __.ddoc.Doc( ''' Sequence of codec names or specifiers. ''' ),
    ] = ( CodecSpecifiers.FromInference, CodecSpecifiers.UserSupplement )
    trial_decode: __.typx.Annotated[
        BehaviorTristate,
        __.ddoc.Doc(
            ''' When to perform trial decode of content with charset. ''' ),
    ] = BehaviorTristate.AsNeeded
    trial_decode_confidence: __.typx.Annotated[
        float,
        __.ddoc.Doc( ''' Minimum confidence to skip trial decode. ''' ),
    ] = 0.80
# Annotated alias for parameters which accept a ``Behaviors`` instance;
# carries the shared parameter documentation.
BehaviorsArgument: __.typx.TypeAlias = __.typx.Annotated[
    Behaviors,
    __.ddoc.Doc(
        ''' Configuration for detection and inference behaviors. ''' ),
]


# Shared instance with every field left at its declared default.
BEHAVIORS_DEFAULT = Behaviors( )
class CharsetResult( __.immut.DataclassObject ):
    ''' Outcome of charset detection: a charset name plus confidence.

        Neither field has a default. The charset may be ``None``.
    '''

    charset: __.typx.Annotated[
        __.typx.Optional[ str ],
        __.ddoc.Doc(
            ''' Detected character set encoding. May be ``None``.''' ),
    ]
    confidence: __.typx.Annotated[
        float,
        __.ddoc.Doc( ''' Detection confidence from 0.0 to 1.0. ''' ),
    ]
class MimetypeResult( __.immut.DataclassObject ):
    ''' Outcome of MIME type detection: a MIME type plus confidence.

        Neither field has a default.
    '''

    mimetype: __.typx.Annotated[
        str,
        __.ddoc.Doc( ''' Detected MIME type. ''' ),
    ]
    confidence: __.typx.Annotated[
        float,
        __.ddoc.Doc( ''' Detection confidence from 0.0 to 1.0. ''' ),
    ]
def confidence_from_bytes_quantity(
    content: _nomina.Content, behaviors: Behaviors = BEHAVIORS_DEFAULT
) -> float:
    ''' Derives confidence from length of content, capped at 1.0.

        Result is length of content divided by
        ``behaviors.bytes_quantity_confidence_divisor``, except that
        quotients of 1.0 or more are reported as 1.0.
    '''
    # NOTE(review): divisor is not validated here; zero raises
    #               ``ZeroDivisionError`` and a negative divisor produces a
    #               negative result.
    divisor = behaviors.bytes_quantity_confidence_divisor
    quotient = len( content ) / divisor
    if quotient < 1.0: return quotient
    return 1.0