Coverage for sources/mimeogram/tokenizers.py: 95%
35 statements
« prev ^ index » next coverage.py v7.6.12, created at 2025-03-02 23:41 +0000
« prev ^ index » next coverage.py v7.6.12, created at 2025-03-02 23:41 +0000
1# vim: set filetype=python fileencoding=utf-8:
2# -*- coding: utf-8 -*-
4#============================================================================#
5# #
6# Licensed under the Apache License, Version 2.0 (the "License"); #
7# you may not use this file except in compliance with the License. #
8# You may obtain a copy of the License at #
9# #
10# http://www.apache.org/licenses/LICENSE-2.0 #
11# #
12# Unless required by applicable law or agreed to in writing, software #
13# distributed under the License is distributed on an "AS IS" BASIS, #
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
15# See the License for the specific language governing permissions and #
16# limitations under the License. #
17# #
18#============================================================================#
21''' Language model tokenizers. '''
24from __future__ import annotations
26import tiktoken as _tiktoken
28from . import __
31_scribe = __.produce_scribe( __name__ )
class Tokenizers( __.enum.Enum ):
    ''' Enumeration of available language model tokenizers. '''

    AnthropicApi = 'anthropic-api'
    Tiktoken = 'tiktoken'

    @classmethod
    async def produce(
        selfclass, name: str, variant: __.Absential[ str ] = __.absent
    ) -> Tokenizer:
        ''' Produces tokenizer from name and optional variant.

            The name is resolved to an enumeration member by value. The
            variant is passed along to the ``from_variant`` constructor of
            the selected tokenizer class.
        '''
        selection = selfclass( name )
        if selection is Tokenizers.AnthropicApi:
            raise NotImplementedError( # noqa: TRY003
                "Not implemented yet. Sorry." )
        if selection is Tokenizers.Tiktoken:
            return await Tiktoken.from_variant( name = variant )
55# pylint: disable=invalid-metaclass
class Tokenizer(
    __.typx.Protocol,
    metaclass = __.ImmutableStandardProtocolDataclass,
    decorators = ( __.standard_dataclass, __.typx.runtime_checkable, ),
):
    ''' Language model tokenizer.

        Declares two abstract coroutines: construction from a variant name
        and counting of tokens in text.
    '''

    @classmethod
    @__.abc.abstractmethod
    async def from_variant(
        selfclass, name: __.Absential[ str ] = __.absent
    ) -> __.typx.Self:
        ''' Produces instance from name of variant. '''
        # Raise, as 'count' does, rather than implicitly returning None,
        # which would contradict the declared return type.
        raise NotImplementedError

    @__.abc.abstractmethod
    async def count( self, text: str ) -> int:
        ''' Counts number of tokens in text. '''
        raise NotImplementedError
74# pylint: enable=invalid-metaclass
77# TODO: Implement 'AnthropicApi' tokenizer.
class Tiktoken(
    Tokenizer, decorators = ( __.standard_dataclass, )
):
    ''' Tokenization via 'tiktoken' package. '''

    # Encoding object which performs the actual tokenization.
    codec: _tiktoken.Encoding

    @classmethod
    async def from_variant(
        selfclass, name: __.Absential[ str ] = __.absent
    ) -> __.typx.Self:
        ''' Produces instance from name of 'tiktoken' encoding.

            Uses 'cl100k_base' when the name is absent. Raises
            ``TokenizerVariantInvalidity``, chained to the original error,
            when encoding lookup fails with a ``ValueError``.
        '''
        if __.is_absent( name ): name = 'cl100k_base'
        from tiktoken import get_encoding
        try: codec = get_encoding( name )
        except ValueError as exc:
            from .exceptions import TokenizerVariantInvalidity
            raise TokenizerVariantInvalidity( 'tiktoken', name ) from exc
        return selfclass( codec = codec )

    async def count( self, text: str ) -> int:
        ''' Counts number of tokens in text.

            Special-token markers which appear in the text (for example,
            '<|endoftext|>') are encoded as ordinary text rather than
            rejected, so that arbitrary content can be counted.
        '''
        # By default, 'encode' raises ValueError when text contains a
        # special-token marker; an empty 'disallowed_special' disables
        # that check.
        return len( self.codec.encode( text, disallowed_special = ( ) ) )