Coverage for sources/detextive/detection.py: 100%
58 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-12 18:11 +0000
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-12 18:11 +0000
1# vim: set filetype=python fileencoding=utf-8:
2# -*- coding: utf-8 -*-
4#============================================================================#
5# #
6# Licensed under the Apache License, Version 2.0 (the "License"); #
7# you may not use this file except in compliance with the License. #
8# You may obtain a copy of the License at #
9# #
10# http://www.apache.org/licenses/LICENSE-2.0 #
11# #
12# Unless required by applicable law or agreed to in writing, software #
13# distributed under the License is distributed on an "AS IS" BASIS, #
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
15# See the License for the specific language governing permissions and #
16# limitations under the License. #
17# #
18#============================================================================#
21''' Core detection function implementations. '''
24from . import __
25from . import exceptions as _exceptions
# Annotated alias for the raw bytes handed to the detection functions.
Content: __.typx.TypeAlias = __.typx.Annotated[
    bytes, __.ddoc.Doc( "Raw byte content for analysis." ) ]
# Annotated alias for the location hint which accompanies the content.
Location: __.typx.TypeAlias = __.typx.Annotated[
    str | __.Path,
    __.ddoc.Doc( "File path, URL, or path components for context." ) ]
37_TEXTUAL_MIME_TYPES = frozenset( (
38 'application/ecmascript',
39 'application/graphql',
40 'application/javascript',
41 'application/json',
42 'application/ld+json',
43 'application/x-httpd-php',
44 'application/x-javascript',
45 'application/x-latex',
46 'application/x-perl',
47 'application/x-php',
48 'application/x-python',
49 'application/x-ruby',
50 'application/x-shell',
51 'application/x-tex',
52 'application/x-yaml',
53 'application/xhtml+xml',
54 'application/xml',
55 'application/yaml',
56 'image/svg+xml',
57) )
58_TEXTUAL_SUFFIXES = ( '+xml', '+json', '+yaml', '+toml' )
def detect_charset( content: Content ) -> __.typx.Optional[ str ]:
    ''' Detects character encoding, preferring UTF-8 whenever it is valid.

    The candidate encoding comes from ``__.chardet.detect``. A candidate
    whose name starts with ``utf`` is returned unchanged and ``ascii`` is
    reported as ``utf-8``. Any other candidate is only trusted when the
    content fails to decode as UTF-8; otherwise ``utf-8`` is returned.

    Returns None if the detector proposes no encoding.
    '''
    candidate = __.chardet.detect( content )[ 'encoding' ]
    if candidate is None: return None
    # NOTE(review): prefix test is case-sensitive; confirm which casing the
    #               detector uses for its UTF names.
    if candidate.startswith( 'utf' ): return candidate
    # ASCII is a subset of UTF-8, so report the superset.
    if candidate == 'ascii': return 'utf-8'
    # Shake out false positives, like 'MacRoman': content which is valid
    # UTF-8 is reported as UTF-8 regardless of the candidate.
    try: content.decode( 'utf-8' )
    except UnicodeDecodeError: return candidate
    return 'utf-8'
def detect_mimetype(
    content: Content,
    location: Location
) -> __.typx.Optional[ str ]:
    ''' Detects MIME type using content analysis and extension fallback.

    Content analysis via ``__.puremagic.from_string`` is attempted first.
    If it raises ``PureError`` or ``ValueError``, or if it produces a falsy
    result (such as an empty string), then the MIME type is guessed from
    the location via ``__.mimetypes.guess_type``.

    Returns a MIME type string or None if neither approach succeeds.
    '''
    try: mimetype = __.puremagic.from_string( content, mime = True )
    except ( __.puremagic.PureError, ValueError ): mimetype = None
    # An empty result is not a usable MIME type. Treat it like a failed
    # detection so that the extension fallback still gets its chance.
    if mimetype: return mimetype
    return __.mimetypes.guess_type( str( location ) )[ 0 ]
def detect_mimetype_and_charset(
    content: Content,
    location: Location, *,
    mimetype: __.Absential[ str ] = __.absent,
    charset: __.Absential[ str ] = __.absent,
) -> tuple[ str, __.typx.Optional[ str ] ]:
    ''' Detects MIME type and charset, honoring caller-supplied overrides.

    A supplied ``mimetype`` or ``charset`` is used instead of detection.

    Resolution, in order:

    * No MIME type but a charset: 'text/plain' is tried and returned with
      the charset if a trial decode yields probable text.
    * Still no MIME type: 'application/octet-stream' is assumed.
    * Textual MIME type: returned with the charset, which may be None.
    * Non-textual MIME type with a caller-supplied charset: the charset
      must survive a trial decode, else ``TextualMimetypeInvalidity``
      propagates to the caller.
    * Non-textual MIME type otherwise: returned with None as the charset.

    Returns tuple of (mimetype, charset).
    '''
    if __.is_absent( mimetype ):
        mimetype_ = detect_mimetype( content, location )
    else: mimetype_ = mimetype
    if __.is_absent( charset ): charset_ = detect_charset( content )
    else: charset_ = charset
    if not mimetype_:
        if charset_:
            # A usable charset suggests plain text; confirm by decoding.
            try:
                _validate_mimetype_with_trial_decode(
                    content, str( location ), 'text/plain', charset_ )
            except _exceptions.TextualMimetypeInvalidity: pass
            else: return 'text/plain', charset_
        mimetype_ = 'application/octet-stream'
    if is_textual_mimetype( mimetype_ ): return mimetype_, charset_
    # No charset for non-textual content, unless the caller insisted on one.
    if __.is_absent( charset ): return mimetype_, None
    _validate_mimetype_with_trial_decode(
        content, str( location ), mimetype_, charset )
    return mimetype_, charset
def is_textual_mimetype( mimetype: str ) -> bool:
    ''' Validates if MIME type represents textual content.

    Before matching, any parameters (everything from the first ';') are
    dropped, surrounding whitespace is stripped, and the type is lowercased.
    Thus, 'Text/HTML; charset=UTF-8' is matched as 'text/html'.

    A MIME type is textual if it:

    * starts with 'text/';
    * is a member of ``_TEXTUAL_MIME_TYPES`` (JSON, XML, JavaScript, etc.);
    * or ends with one of ``_TEXTUAL_SUFFIXES`` (+xml, +json, +yaml, +toml).

    Returns True for MIME types representing textual content.
    '''
    # Media type names are case-insensitive and may carry parameters.
    essence = mimetype.partition( ';' )[ 0 ].strip( ).lower( )
    # 'text/' also covers the experimental 'text/x-' family.
    if essence.startswith( 'text/' ): return True
    if essence in _TEXTUAL_MIME_TYPES: return True
    return essence.endswith( _TEXTUAL_SUFFIXES )
def is_textual_content( content: bytes ) -> bool:
    ''' Determines if byte content represents textual data.

    Runs combined MIME type and charset detection with the placeholder
    location 'unknown'. Content is textual only if a charset is found and
    the MIME type is a textual one.

    Returns True for content that can be reliably processed as text.
    '''
    mimetype, charset = detect_mimetype_and_charset( content, 'unknown' )
    # Without a charset, there is nothing with which to decode the content.
    if charset is None: return False
    return is_textual_mimetype( mimetype )
150def _is_probable_textual_content( content: str ) -> bool:
151 ''' Validates decoded content using heuristic analysis.
153 Applies heuristics to detect meaningful text vs binary data:
154 - Limits control characters to <10% (excluding common whitespace)
155 - Requires >=80% printable characters
157 Returns True for content likely to be meaningful text.
158 '''
159 if not content: return False
160 common_whitespace = '\t\n\r'
161 ascii_control_limit = 32
162 control_chars = sum(
163 1 for c in content
164 if ord( c ) < ascii_control_limit and c not in common_whitespace )
165 if control_chars > len( content ) * 0.1: return False
166 printable_chars = sum(
167 1 for c in content
168 if c.isprintable( ) or c in common_whitespace )
169 return printable_chars >= len( content ) * 0.8
def _validate_mimetype_with_trial_decode(
    content: bytes, location: Location, mimetype: str, charset: str
) -> None:
    ''' Validates that content decodes to probable text with charset.

    Returns None on success. Raises ``TextualMimetypeInvalidity``, built
    from the location and MIME type, if the content cannot be decoded with
    the charset (including an unknown charset name) or if the decoded text
    fails the textual content heuristics.
    '''
    try: decoded = content.decode( charset )
    except ( UnicodeDecodeError, LookupError ) as exc:
        raise _exceptions.TextualMimetypeInvalidity(
            str( location ), mimetype ) from exc
    if _is_probable_textual_content( decoded ): return
    raise _exceptions.TextualMimetypeInvalidity( str( location ), mimetype )