Coverage for sources/detextive/detection.py: 100%

58 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-12 18:11 +0000

1# vim: set filetype=python fileencoding=utf-8: 

2# -*- coding: utf-8 -*- 

3 

4#============================================================================# 

5# # 

6# Licensed under the Apache License, Version 2.0 (the "License"); # 

7# you may not use this file except in compliance with the License. # 

8# You may obtain a copy of the License at # 

9# # 

10# http://www.apache.org/licenses/LICENSE-2.0 # 

11# # 

12# Unless required by applicable law or agreed to in writing, software # 

13# distributed under the License is distributed on an "AS IS" BASIS, # 

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 

15# See the License for the specific language governing permissions and # 

16# limitations under the License. # 

17# # 

18#============================================================================# 

19 

20 

21''' Core detection function implementations. ''' 

22 

23 

24from . import __ 

25from . import exceptions as _exceptions 

26 

27 

# Annotated alias for the byte payload handed to the detectors.
Content: __.typx.TypeAlias = __.typx.Annotated[
    bytes, __.ddoc.Doc( "Raw byte content for analysis." ) ]
# Annotated alias for where the payload came from; the detectors in this
# module stringify it before use.
Location: __.typx.TypeAlias = __.typx.Annotated[
    str | __.Path,
    __.ddoc.Doc( "File path, URL, or path components for context." ) ]

36 

37_TEXTUAL_MIME_TYPES = frozenset( ( 

38 'application/ecmascript', 

39 'application/graphql', 

40 'application/javascript', 

41 'application/json', 

42 'application/ld+json', 

43 'application/x-httpd-php', 

44 'application/x-javascript', 

45 'application/x-latex', 

46 'application/x-perl', 

47 'application/x-php', 

48 'application/x-python', 

49 'application/x-ruby', 

50 'application/x-shell', 

51 'application/x-tex', 

52 'application/x-yaml', 

53 'application/xhtml+xml', 

54 'application/xml', 

55 'application/yaml', 

56 'image/svg+xml', 

57) ) 

58_TEXTUAL_SUFFIXES = ( '+xml', '+json', '+yaml', '+toml' ) 

59 

60 

def detect_charset( content: Content ) -> __.typx.Optional[ str ]:
    ''' Detects character encoding with UTF-8 preference and validation.

        Asks the charset detector for a guess and then normalizes it:

        * no guess: ``None``;
        * any UTF-family name: returned exactly as reported;
        * ASCII: reported as ``'utf-8'``, its superset;
        * anything else: ``'utf-8'`` if the bytes decode cleanly as UTF-8,
          otherwise the reported name.

        Encoding names are compared case-insensitively, because the
        detector may report them in upper case (e.g., ``'UTF-8-SIG'``).

        Returns None if no reliable encoding can be determined.
    '''
    charset = __.chardet.detect( content )[ 'encoding' ]
    if charset is None: return None
    charset_lc = charset.lower( )
    if charset_lc.startswith( 'utf' ): return charset
    if 'ascii' == charset_lc: return 'utf-8' # Assume superset
    # Shake out false positives, like 'MacRoman'
    try: content.decode( 'utf-8' )
    except UnicodeDecodeError: return charset
    return 'utf-8'

77 

78 

def detect_mimetype(
    content: Content,
    location: Location
) -> __.typx.Optional[ str ]:
    ''' Detects MIME type using content analysis and extension fallback.

        Content signatures are tried first. If the signature lookup fails
        or yields an empty MIME type, then the type is guessed from the
        stringified location instead.

        Returns standardized MIME type strings or None if detection fails.
    '''
    try: mimetype = __.puremagic.from_string( content, mime = True )
    except ( __.puremagic.PureError, ValueError ): mimetype = None
    if mimetype: return mimetype
    # A signature can match an entry which carries no MIME type; treat that
    # empty result like a miss so that the location still gets consulted.
    return __.mimetypes.guess_type( str( location ) )[ 0 ]

90 

91 

def detect_mimetype_and_charset(
    content: Content,
    location: Location, *,
    mimetype: __.Absential[ str ] = __.absent,
    charset: __.Absential[ str ] = __.absent,
) -> tuple[ str, __.typx.Optional[ str ] ]:
    ''' Resolves MIME type and charset, honoring caller-supplied values.

        Each of ``mimetype`` and ``charset`` is detected from the content
        only when the caller leaves it absent.

        Returns tuple of (mimetype, charset):

        * Unknown MIME type, known charset, and a successful trial decode:
          ``'text/plain'`` with that charset.
        * Unknown MIME type otherwise: ``'application/octet-stream'``.
        * Textual MIME type: paired with the resolved charset, which may
          be None.
        * Non-textual MIME type: charset is None, unless the caller
          supplied one, in which case it must survive a trial decode.

        Raises ``TextualMimetypeInvalidity`` if a caller-supplied charset
        accompanies a non-textual MIME type and the trial decode fails.
    '''
    if __.is_absent( mimetype ):
        resolved_mimetype = detect_mimetype( content, location )
    else: resolved_mimetype = mimetype
    if __.is_absent( charset ):
        resolved_charset = detect_charset( content )
    else: resolved_charset = charset
    if not resolved_mimetype and resolved_charset:
        # Only a charset is known: accept plain text if it really decodes.
        try:
            _validate_mimetype_with_trial_decode(
                content, str( location ), 'text/plain', resolved_charset )
        except _exceptions.TextualMimetypeInvalidity: pass
        else: return 'text/plain', resolved_charset
    if not resolved_mimetype:
        resolved_mimetype = 'application/octet-stream'
    if is_textual_mimetype( resolved_mimetype ):
        return resolved_mimetype, resolved_charset
    if __.is_absent( charset ):
        return resolved_mimetype, None # no charset for non-textual content
    # Caller insists on a charset for a non-textual type: make it prove out.
    _validate_mimetype_with_trial_decode(
        content, str( location ), resolved_mimetype, charset )
    return resolved_mimetype, charset

124 

125 

def is_textual_mimetype( mimetype: str ) -> bool:
    ''' Validates if MIME type represents textual content.

        The MIME type is first reduced to its essence: any parameters
        after the first ``;`` are dropped, surrounding whitespace is
        stripped, and the remainder is lowercased. Thus,
        ``'Application/JSON; charset=utf-8'`` is judged the same as
        ``'application/json'``.

        The essence is textual if it begins with ``text/``, is one of the
        known textual application types (JSON, XML, JavaScript, etc.), or
        ends with a textual suffix (+xml, +json, +yaml, +toml).

        Returns True for MIME types representing textual content.
    '''
    # Media types are case-insensitive and may carry parameters.
    essence = mimetype.partition( ';' )[ 0 ].strip( ).lower( )
    if essence.startswith( 'text/' ): return True
    if essence in _TEXTUAL_MIME_TYPES: return True
    return essence.endswith( _TEXTUAL_SUFFIXES )

139 

140 

def is_textual_content( content: bytes ) -> bool:
    ''' Determines if byte content represents textual data.

        Runs combined MIME type and charset detection with the placeholder
        location ``'unknown'``. Content qualifies only if a charset was
        resolved and the resolved MIME type is textual.

        Returns True for content that can be reliably processed as text.
    '''
    mimetype, charset = detect_mimetype_and_charset( content, 'unknown' )
    if charset is None: return False
    return is_textual_mimetype( mimetype )

148 

149 

150def _is_probable_textual_content( content: str ) -> bool: 

151 ''' Validates decoded content using heuristic analysis. 

152 

153 Applies heuristics to detect meaningful text vs binary data: 

154 - Limits control characters to <10% (excluding common whitespace) 

155 - Requires >=80% printable characters 

156 

157 Returns True for content likely to be meaningful text. 

158 ''' 

159 if not content: return False 

160 common_whitespace = '\t\n\r' 

161 ascii_control_limit = 32 

162 control_chars = sum( 

163 1 for c in content 

164 if ord( c ) < ascii_control_limit and c not in common_whitespace ) 

165 if control_chars > len( content ) * 0.1: return False 

166 printable_chars = sum( 

167 1 for c in content 

168 if c.isprintable( ) or c in common_whitespace ) 

169 return printable_chars >= len( content ) * 0.8 

170 

171 

def _validate_mimetype_with_trial_decode(
    content: bytes, location: Location, mimetype: str, charset: str
) -> None:
    ''' Confirms that content decodes to plausible text under charset.

        Returns nothing on success. Raises ``TextualMimetypeInvalidity``,
        built from the stringified location and the MIME type, if the
        charset is unknown, the bytes cannot be decoded with it, or the
        decoded text fails the textual content heuristics.
    '''
    location_ = str( location )
    try: decoded = content.decode( charset )
    except ( UnicodeDecodeError, LookupError ) as exc:
        # LookupError covers charset names for which no codec exists.
        raise _exceptions.TextualMimetypeInvalidity(
            location_, mimetype ) from exc
    if _is_probable_textual_content( decoded ): return
    raise _exceptions.TextualMimetypeInvalidity( location_, mimetype )