Coverage for sources / detextive / validation.py: 100%

55 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-14 04:38 +0000

1# vim: set filetype=python fileencoding=utf-8: 

2# -*- coding: utf-8 -*- 

3 

4#============================================================================# 

5# # 

6# Licensed under the Apache License, Version 2.0 (the "License"); # 

7# you may not use this file except in compliance with the License. # 

8# You may obtain a copy of the License at # 

9# # 

10# http://www.apache.org/licenses/LICENSE-2.0 # 

11# # 

12# Unless required by applicable law or agreed to in writing, software # 

13# distributed under the License is distributed on an "AS IS" BASIS, # 

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 

15# See the License for the specific language governing permissions and # 

16# limitations under the License. # 

17# # 

18#============================================================================# 

19 

20 

21''' Validation of textual content. ''' 

22 

23 

24from . import __ 

25 

26 

# Leading letters of the Unicode general categories which validation counts
# as printable: Letter, Mark, Number, Punctuation, Symbol, Separator.
_HYPERCATEGORIES_PRINTABLE = frozenset( 'LMNPSZ' )

BOM_CHARACTER = '\uFEFF' # UTF Byte-Ordering Mark
DELETE_CHARACTER = '\x7F'
ESCAPE_CHARACTER = '\x1B'

# Bidi isolates (Unicode 6.3, recommended)
BIDI_ISOLATE_CHARACTERS = frozenset( {
    '\u2066', # LEFT-TO-RIGHT ISOLATE (LRI)
    '\u2067', # RIGHT-TO-LEFT ISOLATE (RLI)
    '\u2068', # FIRST STRONG ISOLATE (FSI)
    '\u2069', # POP DIRECTIONAL ISOLATE (PDI)
} )
# Legacy bidi controls (Unicode 3.0, deprecated but still used)
BIDI_LEGACY_CHARACTERS = frozenset( {
    '\u202A', # LEFT-TO-RIGHT EMBEDDING (LRE)
    '\u202B', # RIGHT-TO-LEFT EMBEDDING (RLE)
    '\u202C', # POP DIRECTIONAL FORMATTING (PDF)
    '\u202D', # LEFT-TO-RIGHT OVERRIDE (LRO)
    '\u202E', # RIGHT-TO-LEFT OVERRIDE (RLO)
} )
# Horizontal tab, line feed, carriage return
C0_WHITESPACE_CHARACTERS = frozenset( '\t\n\r' )
DIRECTIONAL_MARK_CHARACTERS = frozenset( {
    '\u061C', # ARABIC LETTER MARK
    '\u200E', # LEFT-TO-RIGHT MARK (LRM)
    '\u200F', # RIGHT-TO-LEFT MARK (RLM)
} )
ZERO_WIDTH_CHARACTERS = frozenset( {
    '\u200C', # ZERO WIDTH NON-JOINER (ZWNJ)
    '\u200D', # ZERO WIDTH JOINER (ZWJ)
} )

# Union of every control-like character group above which can legitimately
# occur inside ordinary text.
CONTROL_CHARACTERS_TEXTUAL = frozenset( ).union(
    BIDI_ISOLATE_CHARACTERS,
    BIDI_LEGACY_CHARACTERS,
    C0_WHITESPACE_CHARACTERS,
    DIRECTIONAL_MARK_CHARACTERS,
    ZERO_WIDTH_CHARACTERS )

65 

66 

class Profile( __.immut.DataclassObject ):
    ''' Configuration for text validation heuristics.

        Each field tunes one aspect of the character counting performed by
        :func:`is_valid_text`. Instances are callable: ``profile( text )``
        returns the validation result for that text under this profile.
    '''

    # Consulted before any rejection check: a character in this set is never
    # counted as rejectable. Of these, only tab, line feed, and carriage
    # return are additionally counted as printable.
    acceptable_characters: __.typx.Annotated[
        __.cabc.Set[ str ],
        __.ddoc.Doc(
            ''' Set of characters which are always considered valid. ''' ),
    ] = CONTROL_CHARACTERS_TEXTUAL
    # When true, a leading U+FEFF is dropped from the sample before counting.
    # Validation has no special case for a BOM elsewhere; U+FEFF is in
    # category 'Cf', so it is rejected whenever 'Cf' is a rejectable family.
    check_bom: __.typx.Annotated[
        bool,
        __.ddoc.Doc( ''' Allow leading BOM; reject embedded BOMs. ''' ),
    ] = True
    # The printable count must reach sample size times this ratio.
    printables_ratio_min: __.typx.Annotated[
        float,
        __.ddoc.Doc(
            ''' Minimum ratio of printable characters to total characters.
            ''' ),
    ] = 0.85
    # Counted as rejectable unless also present in acceptable_characters.
    rejectable_characters: __.typx.Annotated[
        __.cabc.Set[ str ],
        __.ddoc.Doc(
            ''' Set of characters which are always considered invalid. ''' ),
    ] = frozenset( ( DELETE_CHARACTER, ) )
    # Two-letter general category codes, compared against the value returned
    # by unicodedata.category for each character.
    rejectable_families: __.typx.Annotated[
        __.cabc.Set[ str ],
        __.ddoc.Doc(
            ''' Set of Unicode categories which are always considered invalid.
            ''' ),
    ] = frozenset( ( 'Cc', 'Cf', 'Co', 'Cs' ) )
    # Validation fails as soon as the rejectable count exceeds sample size
    # times this ratio; the default of 0.0 tolerates no rejectable character.
    rejectables_ratio_max: __.typx.Annotated[
        float,
        __.ddoc.Doc(
            ''' Maximum ratio of rejectable characters to total characters.
            ''' ),
    ] = 0.0
    # None examines the whole text; otherwise at most this many characters
    # from the start of the text are examined.
    sample_quantity: __.typx.Annotated[
        __.typx.Optional[ int ],
        __.ddoc.Doc( ''' Number of characters to sample. ''' ),
    ] = 8192
    # TODO: check_bidi_safety: validate bidirectional text safety
    # TODO: normalize_unicode: apply NFC normalization before validation
    # TODO: permit_ansi_sequences: allow ANSI SGR and other CSI/OSC sequences?

    def __call__( self, text: str ) -> bool:
        ''' Is text valid against this profile? '''
        return is_valid_text( text, profile = self )

113 

114 

ProfileArgument: __.typx.TypeAlias = __.typx.Annotated[
    Profile,
    __.ddoc.Doc( ''' Text validation profile for content analysis. ''' ),
]


# Categories rejected by every output-safety profile below: control (Cc),
# format (Cf), private use (Co), surrogate (Cs), line separator (Zl), and
# paragraph separator (Zp).
_FAMILIES_REJECTABLE_OUTPUT = frozenset(
    ( 'Cc', 'Cf', 'Co', 'Cs', 'Zl', 'Zp' ) )


# Printer profile: form feed is tolerated in addition to the textual controls.
PROFILE_PRINTER_SAFE: __.typx.Annotated[
    Profile, __.ddoc.Doc( ''' Is text safe to send to a printer? ''' ),
] = Profile(
    acceptable_characters = (
        CONTROL_CHARACTERS_TEXTUAL | frozenset( ( '\f', ) ) ),
    check_bom = False,
    rejectable_families = _FAMILIES_REJECTABLE_OUTPUT )

# Baseline profile: every field keeps its default.
PROFILE_TEXTUAL: __.typx.Annotated[
    Profile,
    __.ddoc.Doc(
        ''' Is text likely from a true textual source?

            I.e., is there a high probability that it is not non-textual
            data which was able to be successfully decoded as a Unicode string?

            Must contain a sufficient ratio of printable characters to total
            characters in sample.
        ''' ),
] = Profile( )

# Conservative terminal profile: no additions to the acceptable characters.
PROFILE_TERMINAL_SAFE: __.typx.Annotated[
    Profile,
    __.ddoc.Doc(
        ''' Is text safe to display on most terminals?

            The BEL (alert/bell) and ESC (escape) characters are not permitted
            by this conservative profile.
        ''' ),
] = Profile(
    check_bom = False,
    rejectable_families = _FAMILIES_REJECTABLE_OUTPUT )

# ANSI-aware terminal profile: as above, but the escape character is allowed.
PROFILE_TERMINAL_SAFE_ANSI: __.typx.Annotated[
    Profile,
    __.ddoc.Doc(
        ''' Is text safe to display on terminals with ANSI escapes?

            I.e., text with ANSI CSI/OSC sequences starting with the escape
            character is permitted by this profile.

            The BEL (alert/bell) character is not permitted.
        ''' ),
] = Profile(
    acceptable_characters = (
        CONTROL_CHARACTERS_TEXTUAL.union( ( ESCAPE_CHARACTER, ) ) ),
    check_bom = False,
    rejectable_families = _FAMILIES_REJECTABLE_OUTPUT )

168 

169 

# C0 control characters (U+0000 through U+001F), computed once at import
# rather than rebuilt on every validation call.
_C0_CONTROL_CHARACTERS = frozenset( map( chr, range( 0x20 ) ) )


def is_valid_text(
    text: str, /, profile: Profile = PROFILE_TEXTUAL
) -> bool:
    ''' Is content valid against profile?

        Examines a sample from the start of the text, limited to
        ``profile.sample_quantity`` characters unless that is ``None``.
        When ``profile.check_bom`` is set, a leading BOM is skipped and
        does not count against the sample size.

        Each sampled character is classified in this order:

        * in ``profile.acceptable_characters``: never rejectable; counted
          as printable only if it is tab, line feed, or carriage return;
        * in ``profile.rejectable_characters``, or in a Unicode category
          listed in ``profile.rejectable_families``: counted as rejectable;
        * in a letter, mark, number, punctuation, symbol, or separator
          category: counted as printable.

        Returns ``False`` as soon as the rejectable count exceeds
        ``profile.rejectables_ratio_max`` of the sample size. Otherwise
        returns whether the printable count reaches
        ``profile.printables_ratio_min`` of the sample size. Empty text is
        always valid.
    '''
    if not text: return True
    index_i = 1 if profile.check_bom and text[ 0 ] == BOM_CHARACTER else 0
    index_f = len( text )
    if profile.sample_quantity is not None:
        # Offset by the skipped BOM so that it does not consume a sample
        # slot; otherwise a quantity of 1 would validate nothing at all.
        index_f = min( index_i + profile.sample_quantity, index_f )
    sample = text[ index_i : index_f ]
    sample_size = len( sample )
    acceptables = profile.acceptable_characters
    rejectables = profile.rejectable_characters
    rejectable_families = profile.rejectable_families
    if 'Cc' in rejectable_families:
        # Performance: C0 controls are category 'Cc'; testing set membership
        # avoids a category lookup for each of them.
        rejectables = rejectables | _C0_CONTROL_CHARACTERS
    printables_min = sample_size * profile.printables_ratio_min
    rejectables_max = sample_size * profile.rejectables_ratio_max
    printables_count = 0
    rejectables_count = 0
    for c in sample:
        if c in acceptables:
            # Acceptable characters never count against the text, but only
            # ordinary whitespace among them counts as printable.
            if c in C0_WHITESPACE_CHARACTERS: printables_count += 1
            continue
        if c in rejectables: rejectables_count += 1
        else:
            ucat = __.unicodedata.category( c )
            if ucat in rejectable_families:
                rejectables_count += 1
            elif ucat[ 0 ] in _HYPERCATEGORIES_PRINTABLE:
                printables_count += 1
        # Stop early once the rejectables budget has been exceeded.
        if rejectables_count > rejectables_max: return False
    return printables_count >= printables_min

203 return printables_count >= printables_min