Coverage for sources/detextive/detection.py: 100%

1# vim: set filetype=python fileencoding=utf-8:

2# -*- coding: utf-8 -*-

4#============================================================================#

5# #

6# Licensed under the Apache License, Version 2.0 (the "License"); #

7# you may not use this file except in compliance with the License. #

8# You may obtain a copy of the License at #

9# #

10# http://www.apache.org/licenses/LICENSE-2.0 #

11# #

12# Unless required by applicable law or agreed to in writing, software #

13# distributed under the License is distributed on an "AS IS" BASIS, #

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #

15# See the License for the specific language governing permissions and #

16# limitations under the License. #

17# #

18#============================================================================#

21''' Core detection function implementations. '''

24from . import __

25from . import exceptions as _exceptions

# Raw bytes handed to the detectors in this module.
Content: __.typx.TypeAlias = __.typx.Annotated[
    bytes, __.ddoc.Doc( "Raw byte content for analysis." ) ]
# Origin of the content. The detectors convert it with ``str( )`` before
# using it for extension-based MIME guessing and for error context.
Location: __.typx.TypeAlias = __.typx.Annotated[
    str | __.Path,
    __.ddoc.Doc( "File path, URL, or path components for context." ) ]

# MIME types outside the 'text/' tree which nevertheless carry text.
_TEXTUAL_MIME_TYPES = frozenset( {
    # Scripting and programming languages.
    'application/ecmascript',
    'application/javascript',
    'application/x-httpd-php',
    'application/x-javascript',
    'application/x-perl',
    'application/x-php',
    'application/x-python',
    'application/x-ruby',
    'application/x-shell',
    # Data interchange and query formats.
    'application/graphql',
    'application/json',
    'application/ld+json',
    'application/x-yaml',
    'application/yaml',
    # Markup and typesetting.
    'application/x-latex',
    'application/x-tex',
    'application/xhtml+xml',
    'application/xml',
    'image/svg+xml',
} )
# Structured-syntax suffixes which mark a MIME type as textual.
_TEXTUAL_SUFFIXES = ( '+xml', '+json', '+yaml', '+toml' )

def detect_charset( content: Content ) -> __.typx.Optional[ str ]:
    ''' Detects character encoding with UTF-8 preference and validation.

        The detector's guess is accepted as-is for the UTF family, ASCII
        is widened to UTF-8, and any other guess is kept only if the
        content does not decode cleanly as UTF-8.

        Returns None if no reliable encoding can be determined.
    '''
    charset = __.chardet.detect( content )[ 'encoding' ]
    if charset is None: return None
    # The detector may report names in upper case (e.g., 'UTF-8-SIG' for
    # BOM-prefixed input), so compare case-insensitively. The name is
    # returned exactly as reported.
    normalized = charset.lower( )
    if normalized.startswith( 'utf' ): return charset
    if normalized == 'ascii': return 'utf-8' # Assume superset
    # Shake out false positives, like 'MacRoman'
    try: content.decode( 'utf-8' )
    except UnicodeDecodeError: return charset
    return 'utf-8'

def detect_mimetype(
    content: Content,
    location: Location
) -> __.typx.Optional[ str ]:
    ''' Detects MIME type using content analysis and extension fallback.

        Content signatures are consulted first. The extension of
        ``location`` is consulted when signature matching raises or
        yields no MIME type.

        Returns standardized MIME type strings or None if detection fails.
    '''
    try: mimetype = __.puremagic.from_string( content, mime = True )
    except ( __.puremagic.PureError, ValueError ): mimetype = None
    # A matched signature may carry an empty MIME type; treat that the
    # same as no match rather than returning an empty string.
    if mimetype: return mimetype
    return __.mimetypes.guess_type( str( location ) )[ 0 ]

def detect_mimetype_and_charset(
    content: Content,
    location: Location, *,
    mimetype: __.Absential[ str ] = __.absent,
    charset: __.Absential[ str ] = __.absent,
) -> tuple[ str, __.typx.Optional[ str ] ]:
    ''' Detects MIME type and charset with optional parameter overrides.

        Supplied ``mimetype`` and ``charset`` arguments are used in place
        of detection. When no MIME type is known, 'text/plain' is
        reported if a charset is available and the content passes a trial
        decode; otherwise 'application/octet-stream' is reported.

        A non-textual MIME type is paired with a charset only when the
        caller supplied one explicitly; in that case the content is trial
        decoded and a failure propagates as ``TextualMimetypeInvalidity``.

        Returns tuple of (mimetype, charset).
    '''
    if __.is_absent( mimetype ):
        resolved_mimetype = detect_mimetype( content, location )
    else: resolved_mimetype = mimetype
    if __.is_absent( charset ):
        resolved_charset = detect_charset( content )
    else: resolved_charset = charset
    if not resolved_mimetype:
        if resolved_charset:
            # Only claim plain text if the bytes really decode to text.
            try:
                _validate_mimetype_with_trial_decode(
                    content, str( location ),
                    'text/plain', resolved_charset )
            except _exceptions.TextualMimetypeInvalidity: pass
            else: return 'text/plain', resolved_charset
        resolved_mimetype = 'application/octet-stream'
    if is_textual_mimetype( resolved_mimetype ):
        return resolved_mimetype, resolved_charset
    # No charset for non-textual content unless the caller insisted on one.
    if __.is_absent( charset ): return resolved_mimetype, None
    _validate_mimetype_with_trial_decode(
        content, str( location ), resolved_mimetype, charset )
    return resolved_mimetype, charset

124

125

def is_textual_mimetype( mimetype: str ) -> bool:
    ''' Validates if MIME type represents textual content.

        Supports text/* prefix, specific application types (JSON, XML,
        JavaScript, etc.), and textual suffixes (+xml, +json, +yaml,
        +toml).

        Matching is done on the lower-cased type/subtype only: any
        parameters after a semicolon (e.g., '; charset=utf-8') and
        surrounding whitespace are ignored.

        Returns True for MIME types representing textual content.
    '''
    # Reduce 'Text/HTML; charset=utf-8' to 'text/html' before matching.
    essence = mimetype.partition( ';' )[ 0 ].strip( ).lower( )
    if essence.startswith( 'text/' ): return True
    if essence in _TEXTUAL_MIME_TYPES: return True
    return essence.endswith( _TEXTUAL_SUFFIXES )

139

140

def is_textual_content( content: bytes ) -> bool:
    ''' Determines if byte content represents textual data.

        Runs combined MIME type and charset detection with a placeholder
        location, since no path is available here. The content counts as
        textual only when a charset was found and the MIME type is a
        textual one.

        Returns True for content that can be reliably processed as text.
    '''
    mimetype, charset = detect_mimetype_and_charset( content, 'unknown' )
    if charset is None: return False
    return is_textual_mimetype( mimetype )

148

149

def _is_probable_textual_content( content: str ) -> bool:
    ''' Validates decoded content using heuristic analysis.

        Two thresholds must both hold:
        - Control characters (code points below 32, other than tab,
          newline and carriage return) make up at most 10% of the text.
        - Printable characters, counting those three whitespace
          characters as printable, make up at least 80% of the text.

        Empty content is never considered text.

        Returns True for content likely to be meaningful text.
    '''
    total = len( content )
    if total == 0: return False
    whitespace = '\t\n\r'
    controls = 0
    printables = 0
    for character in content:
        if character in whitespace: printables += 1
        # Code points below 32 are never printable, so the two tallies
        # cannot both apply to the same character.
        elif ord( character ) < 32: controls += 1 # noqa: PLR2004
        elif character.isprintable( ): printables += 1
    if controls > total * 0.1: return False
    return printables >= total * 0.8

170

171

def _validate_mimetype_with_trial_decode(
    content: bytes, location: Location, mimetype: str, charset: str
) -> None:
    ''' Confirms that content decodes to plausible text with charset.

        Returns nothing on success. Raises ``TextualMimetypeInvalidity``,
        carrying the location and MIME type, when the charset is unknown,
        when the bytes cannot be decoded with it, or when the decoded
        text fails the textual-content heuristics.
    '''
    try: decoded = content.decode( charset )
    except ( UnicodeDecodeError, LookupError ) as exc:
        # LookupError covers charset names which Python does not know.
        raise _exceptions.TextualMimetypeInvalidity(
            str( location ), mimetype ) from exc
    if _is_probable_textual_content( decoded ): return
    raise _exceptions.TextualMimetypeInvalidity(
        str( location ), mimetype )