Coverage for sources / detextive / validation.py: 100%
55 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-14 04:38 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-14 04:38 +0000
1# vim: set filetype=python fileencoding=utf-8:
2# -*- coding: utf-8 -*-
4#============================================================================#
5# #
6# Licensed under the Apache License, Version 2.0 (the "License"); #
7# you may not use this file except in compliance with the License. #
8# You may obtain a copy of the License at #
9# #
10# http://www.apache.org/licenses/LICENSE-2.0 #
11# #
12# Unless required by applicable law or agreed to in writing, software #
13# distributed under the License is distributed on an "AS IS" BASIS, #
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
15# See the License for the specific language governing permissions and #
16# limitations under the License. #
17# #
18#============================================================================#
21''' Validation of textual content. '''
24from . import __
# First letters of the Unicode general categories which are tallied as
# printable: letters, marks, numbers, punctuation, symbols, separators.
_HYPERCATEGORIES_PRINTABLE = frozenset( { 'L', 'M', 'N', 'P', 'S', 'Z' } )

# Individual characters which receive special treatment.
BOM_CHARACTER = '\ufeff'  # UTF byte-order mark
DELETE_CHARACTER = '\x7f'
ESCAPE_CHARACTER = '\x1b'

# Bidirectional isolates (Unicode 6.3; the recommended controls).
BIDI_ISOLATE_CHARACTERS = frozenset( {
    '\u2066',  # LEFT-TO-RIGHT ISOLATE (LRI)
    '\u2067',  # RIGHT-TO-LEFT ISOLATE (RLI)
    '\u2068',  # FIRST STRONG ISOLATE (FSI)
    '\u2069',  # POP DIRECTIONAL ISOLATE (PDI)
} )

# Legacy bidirectional controls (Unicode 3.0; deprecated but still seen).
BIDI_LEGACY_CHARACTERS = frozenset( {
    '\u202A',  # LEFT-TO-RIGHT EMBEDDING (LRE)
    '\u202B',  # RIGHT-TO-LEFT EMBEDDING (RLE)
    '\u202C',  # POP DIRECTIONAL FORMATTING (PDF)
    '\u202D',  # LEFT-TO-RIGHT OVERRIDE (LRO)
    '\u202E',  # RIGHT-TO-LEFT OVERRIDE (RLO)
} )

# Tab, line feed, and carriage return.
C0_WHITESPACE_CHARACTERS = frozenset( { '\t', '\n', '\r' } )

DIRECTIONAL_MARK_CHARACTERS = frozenset( {
    '\u061C',  # ARABIC LETTER MARK
    '\u200E',  # LEFT-TO-RIGHT MARK (LRM)
    '\u200F',  # RIGHT-TO-LEFT MARK (RLM)
} )

ZERO_WIDTH_CHARACTERS = frozenset( {
    '\u200C',  # ZERO WIDTH NON-JOINER (ZWNJ)
    '\u200D',  # ZERO WIDTH JOINER (ZWJ)
} )

# Union of every group above: control and format characters which
# legitimately occur within ordinary text.
CONTROL_CHARACTERS_TEXTUAL = frozenset( ).union(
    BIDI_ISOLATE_CHARACTERS,
    BIDI_LEGACY_CHARACTERS,
    C0_WHITESPACE_CHARACTERS,
    DIRECTIONAL_MARK_CHARACTERS,
    ZERO_WIDTH_CHARACTERS,
)
class Profile( __.immut.DataclassObject ):
    ''' Settings which steer the text validation heuristics.

        An instance is callable: ``profile( text )`` delegates to
        ``is_valid_text( text, profile = profile )``.
    '''

    # Consulted before any rejection test; a character found here is
    # never counted as rejectable.
    acceptable_characters: __.typx.Annotated[
        __.cabc.Set[ str ],
        __.ddoc.Doc(
            ''' Set of characters which are always considered valid. ''' ),
    ] = CONTROL_CHARACTERS_TEXTUAL

    # When true, a BOM in the very first position is skipped instead of
    # being examined like any other character.
    check_bom: __.typx.Annotated[
        bool,
        __.ddoc.Doc( ''' Allow leading BOM; reject embedded BOMs. ''' ),
    ] = True

    printables_ratio_min: __.typx.Annotated[
        float,
        __.ddoc.Doc(
            ''' Minimum ratio of printable characters to total characters.
            ''' ),
    ] = 0.85

    rejectable_characters: __.typx.Annotated[
        __.cabc.Set[ str ],
        __.ddoc.Doc(
            ''' Set of characters which are always considered invalid. ''' ),
    ] = frozenset( { DELETE_CHARACTER } )

    # Two-letter Unicode general categories: control, format, private use,
    # and surrogate by default.
    rejectable_families: __.typx.Annotated[
        __.cabc.Set[ str ],
        __.ddoc.Doc(
            ''' Set of Unicode categories which are always considered invalid.
            ''' ),
    ] = frozenset( { 'Cc', 'Cf', 'Co', 'Cs' } )

    # The default of zero means that a single rejectable character in the
    # sample fails validation.
    rejectables_ratio_max: __.typx.Annotated[
        float,
        __.ddoc.Doc(
            ''' Maximum ratio of rejectable characters to total characters.
            ''' ),
    ] = 0.0

    # ``None`` disables sampling, so the entire text is examined.
    sample_quantity: __.typx.Annotated[
        __.typx.Optional[ int ],
        __.ddoc.Doc( ''' Number of characters to sample. ''' ),
    ] = 8192

    # TODO: check_bidi_safety: validate bidirectional text safety
    # TODO: normalize_unicode: apply NFC normalization before validation
    # TODO: permit_ansi_sequences: allow ANSI SGR and other CSI/OSC sequences?

    def __call__( self, text: str ) -> bool:
        ''' Does text pass validation under this profile? '''
        return is_valid_text( text, profile = self )
# Annotation for parameters which accept a validation profile.
ProfileArgument: __.typx.TypeAlias = __.typx.Annotated[
    Profile,
    __.ddoc.Doc( ''' Text validation profile for content analysis. ''' ) ]
# Categories rejected by every device-oriented profile below: the default
# control, format, private-use, and surrogate categories plus the line and
# paragraph separators.
_REJECTABLE_FAMILIES_DEVICE = frozenset( {
    'Cc', 'Cf', 'Co', 'Cs', 'Zl', 'Zp' } )


# Accepts form feed in addition to the ordinary textual controls.
PROFILE_PRINTER_SAFE: __.typx.Annotated[
    Profile, __.ddoc.Doc( ''' Is text safe to send to a printer? ''' ),
] = Profile(
    acceptable_characters = CONTROL_CHARACTERS_TEXTUAL.union( ( '\f', ) ),
    check_bom = False,
    rejectable_families = _REJECTABLE_FAMILIES_DEVICE )

# Every setting at its default.
PROFILE_TEXTUAL: __.typx.Annotated[
    Profile,
    __.ddoc.Doc(
        ''' Is text likely from a true textual source?

            I.e., is there a high probability that it is not non-textual
            data which was able to be successfully decoded as a Unicode string?

            Must contain a sufficient ratio of printable characters to total
            characters in sample.
        ''' ),
] = Profile( )

# Default acceptable characters; no special handling of a leading BOM.
PROFILE_TERMINAL_SAFE: __.typx.Annotated[
    Profile,
    __.ddoc.Doc(
        ''' Is text safe to display on most terminals?

            The BEL (alert/bell) and ESC (escape) characters are not permitted
            by this conservative profile.
        ''' ),
] = Profile(
    check_bom = False,
    rejectable_families = _REJECTABLE_FAMILIES_DEVICE )

# As the terminal-safe profile, but with the escape character accepted.
PROFILE_TERMINAL_SAFE_ANSI: __.typx.Annotated[
    Profile,
    __.ddoc.Doc(
        ''' Is text safe to display on terminals with ANSI escapes?

            I.e., text with ANSI CSI/OSC sequences starting with the escape
            character is permitted by this profile.

            The BEL (alert/bell) character is not permitted.
        ''' ),
] = Profile(
    acceptable_characters = (
        CONTROL_CHARACTERS_TEXTUAL.union( ( ESCAPE_CHARACTER, ) ) ),
    check_bom = False,
    rejectable_families = _REJECTABLE_FAMILIES_DEVICE )
# C0 control characters (U+0000 through U+001F), built once at import
# rather than on every call.
_C0_CONTROL_CHARACTERS = frozenset( map( chr, range( 0x20 ) ) )


def is_valid_text(
    text: str, /, profile: Profile = PROFILE_TEXTUAL
) -> bool:
    ''' Is content valid against profile?

        Examines a sample from the start of the text and classifies each
        character as acceptable, rejectable, printable, or none of these.

        :param text: Text to validate. Empty text is always valid.
        :param profile: Thresholds and character sets to validate against.
        :returns: ``False`` as soon as the number of rejectable characters
            exceeds ``rejectables_ratio_max`` times the sample size;
            otherwise whether the number of printable characters reaches
            ``printables_ratio_min`` times the sample size.
    '''
    if not text: return True
    # A leading BOM is skipped only if the profile requests it. The end of
    # the sample window moves by the same offset, so that the sample still
    # holds up to ``sample_quantity`` characters of actual content.
    index_i = 1 if profile.check_bom and text[ 0 ] == BOM_CHARACTER else 0
    index_f = len( text )
    if profile.sample_quantity is not None:
        index_f = min( index_i + profile.sample_quantity, index_f )
    sample = text[ index_i : index_f ]
    sample_size = len( sample )
    acceptables = profile.acceptable_characters
    rejectables = profile.rejectable_characters
    rejectable_families = profile.rejectable_families
    if 'Cc' in rejectable_families:
        # Performance: C0 controls are matched by set membership, which
        # avoids a category lookup for each of them.
        rejectables = rejectables | _C0_CONTROL_CHARACTERS
    printables_min = sample_size * profile.printables_ratio_min
    rejectables_max = sample_size * profile.rejectables_ratio_max
    printables_count = 0
    rejectables_count = 0
    for c in sample:
        if c in acceptables:
            # Acceptable characters are never rejectable. Among them, only
            # tab, line feed, and carriage return count as printable.
            if c in C0_WHITESPACE_CHARACTERS: printables_count += 1
            continue
        if c not in rejectables:
            ucat = __.unicodedata.category( c )
            if ucat not in rejectable_families:
                if ucat[ 0 ] in _HYPERCATEGORIES_PRINTABLE:
                    printables_count += 1
                continue
        # Reached for an explicitly rejectable character or for one in a
        # rejectable category.
        rejectables_count += 1
        # The count never decreases, so the outcome is already decided.
        if rejectables_count > rejectables_max: return False
    return printables_count >= printables_min