Coverage for sources/mimeogram/tokenizers.py: 95%
34 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-05 19:15 +0000
1# vim: set filetype=python fileencoding=utf-8:
2# -*- coding: utf-8 -*-
4#============================================================================#
5# #
6# Licensed under the Apache License, Version 2.0 (the "License"); #
7# you may not use this file except in compliance with the License. #
8# You may obtain a copy of the License at #
9# #
10# http://www.apache.org/licenses/LICENSE-2.0 #
11# #
12# Unless required by applicable law or agreed to in writing, software #
13# distributed under the License is distributed on an "AS IS" BASIS, #
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
15# See the License for the specific language governing permissions and #
16# limitations under the License. #
17# #
18#============================================================================#
21''' Language model tokenizers. '''
24import tiktoken as _tiktoken
26from . import __
29_scribe = __.produce_scribe( __name__ )
class Tokenizers( __.enum.Enum ):
    ''' Enumeration of supported language model tokenizers. '''

    AnthropicApi = 'anthropic-api'
    Tiktoken = 'tiktoken'

    @classmethod
    async def produce(
        selfclass, name: str, variant: __.Absential[ str ] = __.absent
    ) -> "Tokenizer":
        ''' Builds tokenizer selected by name, with optional variant.

            The name is resolved to an enumeration member by value. The
            variant, which may be absent, is forwarded to the selected
            tokenizer's ``from_variant`` constructor.

            Raises ``NotImplementedError`` for the 'anthropic-api' tokenizer.
        '''
        selection = selfclass( name )
        if selection is Tokenizers.AnthropicApi:
            raise NotImplementedError( "Not implemented yet. Sorry." )
        # Every member of the enumeration is handled by the checks in this
        # method; nothing follows the final check.
        if selection is Tokenizers.Tiktoken:
            return await Tiktoken.from_variant( name = variant )
class Tokenizer(
    __.immut.DataclassProtocol, __.typx.Protocol,
    decorators = ( __.typx.runtime_checkable, ),
):
    ''' Language model tokenizer.

        Declares the two operations a tokenizer provides: an asynchronous
        alternate constructor, ``from_variant``, and an asynchronous token
        counter, ``count``. Both are abstract.
    '''

    @classmethod
    @__.abc.abstractmethod
    async def from_variant(
        selfclass, name: __.Absential[ str ] = __.absent
    ) -> __.typx.Self:
        ''' Produces instance from name of variant.

            The name may be absent.
        '''
        # An abstract body can still be reached via 'super( )'. Raise, as
        # 'count' does, rather than silently returning None in violation of
        # the annotated return type.
        raise NotImplementedError

    @__.abc.abstractmethod
    async def count( self, text: str ) -> int:
        ''' Counts number of tokens in text. '''
        raise NotImplementedError
71# TODO: Implement 'AnthropicApi' tokenizer.
class Tiktoken( Tokenizer ):
    ''' Tokenization via 'tiktoken' package. '''

    # Encoding object from 'tiktoken' which performs the tokenization.
    codec: _tiktoken.Encoding

    @classmethod
    async def from_variant(
        selfclass, name: __.Absential[ str ] = __.absent
    ) -> __.typx.Self:
        ''' Produces instance from name of 'tiktoken' encoding.

            Uses the 'cl100k_base' encoding when the name is absent.

            Raises ``TokenizerVariantInvalidity``, chained from the original
            ``ValueError``, when the encoding lookup rejects the name.
        '''
        if __.is_absent( name ): name = 'cl100k_base'
        # The package is already bound as '_tiktoken' at module level, so
        # there is no need to import from it again here.
        # NOTE(review): 'get_encoding' is a synchronous call and presumably
        # may load encoding data on first use -- confirm whether this should
        # be moved off the event loop.
        try: codec = _tiktoken.get_encoding( name )
        except ValueError as exc:
            from .exceptions import TokenizerVariantInvalidity
            raise TokenizerVariantInvalidity( 'tiktoken', name ) from exc
        return selfclass( codec = codec )

    async def count( self, text: str ) -> int:
        ''' Counts number of tokens in text.

            Text which contains special-token markers (for example,
            '<|endoftext|>') is counted as ordinary text instead of causing
            an error.
        '''
        # By default, 'encode' raises ValueError if the text contains
        # anything which looks like a special token. Arbitrary content being
        # measured may legitimately contain such markers, so the check is
        # disabled and they are encoded as plain text.
        return len( self.codec.encode( text, disallowed_special = ( ) ) )