Coverage for sources/mimeogram/tokenizers.py: 95%
35 statements
« prev ^ index » next coverage.py v7.8.2, created at 2025-06-07 04:07 +0000
1# vim: set filetype=python fileencoding=utf-8:
2# -*- coding: utf-8 -*-
4#============================================================================#
5# #
6# Licensed under the Apache License, Version 2.0 (the "License"); #
7# you may not use this file except in compliance with the License. #
8# You may obtain a copy of the License at #
9# #
10# http://www.apache.org/licenses/LICENSE-2.0 #
11# #
12# Unless required by applicable law or agreed to in writing, software #
13# distributed under the License is distributed on an "AS IS" BASIS, #
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
15# See the License for the specific language governing permissions and #
16# limitations under the License. #
17# #
18#============================================================================#
21''' Language model tokenizers. '''
24from __future__ import annotations
26import tiktoken as _tiktoken
28from . import __
31_scribe = __.produce_scribe( __name__ )
class Tokenizers( __.enum.Enum ):
    ''' Enumeration of selectable language model tokenizers. '''

    AnthropicApi = 'anthropic-api'
    Tiktoken = 'tiktoken'

    @classmethod
    async def produce(
        selfclass, name: str, variant: __.Absential[ str ] = __.absent
    ) -> Tokenizer:
        ''' Produces tokenizer from name and optional variant.

            The name is passed to the enumeration constructor to select a
            member. The variant is forwarded, unchanged, to the tokenizer
            class which corresponds to the selected member.
        '''
        selection = selfclass( name )
        # Guard clause: nothing backs this member yet.
        if selection is Tokenizers.AnthropicApi:
            raise NotImplementedError( "Not implemented yet. Sorry." )
        if selection is Tokenizers.Tiktoken:
            return await Tiktoken.from_variant( name = variant )
class Tokenizer(
    __.typx.Protocol,
    metaclass = __.ImmutableStandardProtocolDataclass,
    decorators = ( __.standard_dataclass, __.typx.runtime_checkable, ),
):
    ''' Language model tokenizer.

        Interface for tokenizers: an asynchronous alternate constructor
        and an asynchronous token counter.
    '''

    @classmethod
    @__.abc.abstractmethod
    async def from_variant(
        selfclass, name: __.Absential[ str ] = __.absent
    ) -> __.typx.Self:
        ''' Produces instance from name of variant.

            The name may be absent, in which case an implementation may
            fall back to a default variant of its choosing.
        '''
        # Raise instead of falling through to an implicit None, which
        # would contradict the declared return type; same as 'count'.
        raise NotImplementedError

    @__.abc.abstractmethod
    async def count( self, text: str ) -> int:
        ''' Counts number of tokens in text. '''
        raise NotImplementedError
74# TODO: Implement 'AnthropicApi' tokenizer.
class Tiktoken(
    Tokenizer, decorators = ( __.standard_dataclass, )
):
    ''' Tokenization via 'tiktoken' package. '''

    # Encoding object from 'tiktoken' which performs the tokenization.
    codec: _tiktoken.Encoding

    @classmethod
    async def from_variant(
        selfclass, name: __.Absential[ str ] = __.absent
    ) -> __.typx.Self:
        ''' Produces instance from name of 'tiktoken' encoding.

            Uses 'cl100k_base' when the name is absent.

            Raises TokenizerVariantInvalidity, chained to the original
            error, if the encoding lookup fails with a ValueError.
        '''
        if __.is_absent( name ): name = 'cl100k_base'
        from tiktoken import get_encoding
        try: encoding = get_encoding( name )
        except ValueError as exc:
            from .exceptions import TokenizerVariantInvalidity
            raise TokenizerVariantInvalidity( 'tiktoken', name ) from exc
        return selfclass( codec = encoding )

    async def count( self, text: str ) -> int:
        ''' Counts number of tokens in text.

            Special-token markers which appear in the text, such as
            '<|endoftext|>', are counted as ordinary text.
        '''
        # By default, 'encode' raises ValueError on text which contains a
        # special-token marker. An empty 'disallowed_special' disables that
        # check so that arbitrary input can always be counted.
        tokens = self.codec.encode( text, disallowed_special = ( ) )
        return len( tokens )