Coverage for sources/detextive/validation.py: 100%

1# vim: set filetype=python fileencoding=utf-8:

2# -*- coding: utf-8 -*-

4#============================================================================#

5# #

6# Licensed under the Apache License, Version 2.0 (the "License"); #

7# you may not use this file except in compliance with the License. #

8# You may obtain a copy of the License at #

9# #

10# http://www.apache.org/licenses/LICENSE-2.0 #

11# #

12# Unless required by applicable law or agreed to in writing, software #

13# distributed under the License is distributed on an "AS IS" BASIS, #

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #

15# See the License for the specific language governing permissions and #

16# limitations under the License. #

17# #

18#============================================================================#

21''' Validation of textual content. '''

24from . import __

# First letters of the Unicode general categories which count as printable:
# letters, marks, numbers, punctuation, symbols and separators.
_HYPERCATEGORIES_PRINTABLE = frozenset( { 'L', 'M', 'N', 'P', 'S', 'Z' } )

BOM_CHARACTER = '\ufeff' # UTF Byte-Ordering Mark
DELETE_CHARACTER = '\x7f'
ESCAPE_CHARACTER = '\x1b'

# Bidi isolates (Unicode 6.3, recommended)
BIDI_ISOLATE_CHARACTERS = frozenset( {
    '\u2066', # LEFT-TO-RIGHT ISOLATE (LRI)
    '\u2067', # RIGHT-TO-LEFT ISOLATE (RLI)
    '\u2068', # FIRST STRONG ISOLATE (FSI)
    '\u2069', # POP DIRECTIONAL ISOLATE (PDI)
} )
# Legacy bidi controls (Unicode 3.0, deprecated but still used)
BIDI_LEGACY_CHARACTERS = frozenset( {
    '\u202A', # LEFT-TO-RIGHT EMBEDDING (LRE)
    '\u202B', # RIGHT-TO-LEFT EMBEDDING (RLE)
    '\u202C', # POP DIRECTIONAL FORMATTING (PDF)
    '\u202D', # LEFT-TO-RIGHT OVERRIDE (LRO)
    '\u202E', # RIGHT-TO-LEFT OVERRIDE (RLO)
} )
# Tab, line feed and carriage return.
C0_WHITESPACE_CHARACTERS = frozenset( { '\t', '\n', '\r' } )
DIRECTIONAL_MARK_CHARACTERS = frozenset( {
    '\u061C', # ARABIC LETTER MARK
    '\u200E', # LEFT-TO-RIGHT MARK (LRM)
    '\u200F', # RIGHT-TO-LEFT MARK (RLM)
} )
ZERO_WIDTH_CHARACTERS = frozenset( {
    '\u200C', # ZERO WIDTH NON-JOINER (ZWNJ)
    '\u200D', # ZERO WIDTH JOINER (ZWJ)
} )

# Union of all of the character sets above.
CONTROL_CHARACTERS_TEXTUAL = frozenset( ).union(
    BIDI_ISOLATE_CHARACTERS,
    BIDI_LEGACY_CHARACTERS,
    C0_WHITESPACE_CHARACTERS,
    DIRECTIONAL_MARK_CHARACTERS,
    ZERO_WIDTH_CHARACTERS )

class Profile( __.immut.DataclassObject ):
    ''' Configuration for text validation heuristics.

        A profile is callable: calling it with a string evaluates that
        string via ``is_valid_text`` with this profile.
    '''

    # Characters in this set are never counted as rejectable. Of them, only
    # tab, line feed and carriage return are counted as printable.
    acceptable_characters: __.typx.Annotated[
        __.cabc.Set[ str ],
        __.ddoc.Doc(
            ''' Set of characters which are always considered valid. ''' ),
    ] = CONTROL_CHARACTERS_TEXTUAL
    # When true, a BOM at the very start of the text is skipped rather than
    # examined.
    check_bom: __.typx.Annotated[
        bool,
        __.ddoc.Doc( ''' Allow leading BOM; reject embedded BOMs. ''' ),
    ] = True
    # Fraction of the sample size which the printable character count must
    # reach for the text to be considered valid.
    printables_ratio_min: __.typx.Annotated[
        float,
        __.ddoc.Doc(
            ''' Minimum ratio of printable characters to total characters.
            ''' ),
    ] = 0.85
    # Consulted only for characters which are not acceptable characters.
    rejectable_characters: __.typx.Annotated[
        __.cabc.Set[ str ],
        __.ddoc.Doc(
            ''' Set of characters which are always considered invalid. ''' ),
    ] = frozenset( ( DELETE_CHARACTER, ) )
    # Two-letter Unicode general categories, as reported by
    # ``unicodedata.category``.
    rejectable_families: __.typx.Annotated[
        __.cabc.Set[ str ],
        __.ddoc.Doc(
            ''' Set of Unicode categories which are always considered invalid.
            ''' ),
    ] = frozenset( ( 'Cc', 'Cf', 'Co', 'Cs' ) )
    # The default of 0.0 means that a single rejectable character in the
    # sample invalidates the text.
    rejectables_ratio_max: __.typx.Annotated[
        float,
        __.ddoc.Doc(
            ''' Maximum ratio of rejectable characters to total characters.
            ''' ),
    ] = 0.0
    # Upper bound on how many leading characters of the text are examined;
    # ``None`` removes the bound.
    sample_quantity: __.typx.Annotated[
        __.typx.Optional[ int ],
        __.ddoc.Doc( ''' Number of characters to sample. ''' ),
    ] = 8192
    # TODO: check_bidi_safety: validate bidirectional text safety
    # TODO: normalize_unicode: apply NFC normalization before validation
    # TODO: permit_ansi_sequences: allow ANSI SGR and other CSI/OSC sequences?

    def __call__( self, text: str ) -> bool:
        ''' Is text valid against this profile? '''
        return is_valid_text( text, profile = self )

113

114

ProfileArgument: __.typx.TypeAlias = __.typx.Annotated[
    Profile,
    __.ddoc.Doc( ''' Text validation profile for content analysis. ''' ),
]


# Unicode categories rejected by the printer and terminal profiles: the
# default rejectable families plus the line and paragraph separators.
_REJECTABLE_FAMILIES_DEVICE = frozenset( {
    'Cc', 'Cf', 'Co', 'Cs', 'Zl', 'Zp' } )

# Same as the default acceptable characters, plus form feed.
PROFILE_PRINTER_SAFE: __.typx.Annotated[
    Profile, __.ddoc.Doc( ''' Is text safe to send to a printer? ''' ),
] = Profile(
    acceptable_characters = (
        CONTROL_CHARACTERS_TEXTUAL | frozenset( ( '\f', ) ) ),
    check_bom = False,
    rejectable_families = _REJECTABLE_FAMILIES_DEVICE )

# All defaults; also serves as the default profile of ``is_valid_text``.
PROFILE_TEXTUAL: __.typx.Annotated[
    Profile,
    __.ddoc.Doc(
        ''' Is text likely from a true textual source?

            I.e., is there a high probability that it is not non-textual
            data which was able to be successfully decoded as a Unicode string?

            Must contain a sufficient ratio of printable characters to total
            characters in sample.
        ''' ),
] = Profile( )

PROFILE_TERMINAL_SAFE: __.typx.Annotated[
    Profile,
    __.ddoc.Doc(
        ''' Is text safe to display on most terminals?

            The BEL (alert/bell) and ESC (escape) characters are not permitted
            by this conservative profile.
        ''' ),
] = Profile(
    check_bom = False,
    rejectable_families = _REJECTABLE_FAMILIES_DEVICE )

# Same as the terminal-safe profile, except that the escape character is
# acceptable.
PROFILE_TERMINAL_SAFE_ANSI: __.typx.Annotated[
    Profile,
    __.ddoc.Doc(
        ''' Is text safe to display on terminals with ANSI escapes?

            I.e., text with ANSI CSI/OSC sequences starting with the escape
            character is permitted by this profile.

            The BEL (alert/bell) character is not permitted.
        ''' ),
] = Profile(
    acceptable_characters = (
        CONTROL_CHARACTERS_TEXTUAL | frozenset( ( ESCAPE_CHARACTER, ) ) ),
    check_bom = False,
    rejectable_families = _REJECTABLE_FAMILIES_DEVICE )

168

169

def is_valid_text(
    text: str, /, profile: Profile = PROFILE_TEXTUAL
) -> bool:
    ''' Is content valid against profile?

        Examines a sample from the start of the text and classifies each
        sampled character:

        * Acceptable characters of the profile are never rejectable; among
          them, only tab, line feed and carriage return count as printable.
        * Rejectable characters of the profile, and characters whose Unicode
          category is among the rejectable families of the profile, count
          as rejectable.
        * Any other character counts as printable if its major Unicode
          category is letter, mark, number, punctuation, symbol or
          separator.

        Returns ``False`` as soon as the rejectable count exceeds the
        maximum ratio of the sample size permitted by the profile.
        Otherwise, returns whether the printable count reaches the minimum
        ratio of the sample size required by the profile.

        Empty text is considered valid.
    '''
    if not text: return True
    # A leading BOM is skipped, rather than examined, if the profile asks.
    index_i = 1 if profile.check_bom and text[ 0 ] == BOM_CHARACTER else 0
    index_f = len( text )
    if profile.sample_quantity is not None:
        # Measure the sample from the first examined character, so that a
        # skipped BOM does not consume part of the sampling budget.
        index_f = min( index_i + profile.sample_quantity, index_f )
    sample = text[ index_i : index_f ]
    sample_size = len( sample )
    acceptables = profile.acceptable_characters
    rejectables = profile.rejectable_characters
    if 'Cc' in profile.rejectable_families:
        # Performance: Add C0 control characters to rejectables set.
        rejectables = rejectables | { chr( i ) for i in range( 0x20 ) }
    rejectable_families = profile.rejectable_families
    printables_min = sample_size * profile.printables_ratio_min
    rejectables_max = sample_size * profile.rejectables_ratio_max
    printables_count = 0
    rejectables_count = 0
    for c in sample:
        if c in acceptables:
            # Acceptable characters take precedence over rejectable ones.
            if c in C0_WHITESPACE_CHARACTERS: printables_count += 1
            continue
        if c in rejectables: rejectables_count += 1
        else:
            ucat = __.unicodedata.category( c )
            if ucat in rejectable_families:
                rejectables_count += 1
            elif ucat[ 0 ] in _HYPERCATEGORIES_PRINTABLE:
                printables_count += 1
        # Early exit: the rejectable count can only grow.
        if rejectables_count > rejectables_max: return False
    return printables_count >= printables_min

Coverage for sources / detextive / validation.py: 100%

55 statements