Coverage for sources/mimeogram/tokenizers.py: 95%

34 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-05 19:46 +0000

1# vim: set filetype=python fileencoding=utf-8: 

2# -*- coding: utf-8 -*- 

3 

4#============================================================================# 

5# # 

6# Licensed under the Apache License, Version 2.0 (the "License"); # 

7# you may not use this file except in compliance with the License. # 

8# You may obtain a copy of the License at # 

9# # 

10# http://www.apache.org/licenses/LICENSE-2.0 # 

11# # 

12# Unless required by applicable law or agreed to in writing, software # 

13# distributed under the License is distributed on an "AS IS" BASIS, # 

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 

15# See the License for the specific language governing permissions and # 

16# limitations under the License. # 

17# # 

18#============================================================================# 

19 

20 

21''' Language model tokenizers. ''' 

22 

23 

24import tiktoken as _tiktoken 

25 

26from . import __ 

27 

28 

# Module-level scribe produced for this module's name.
# NOTE(review): presumably a logger-like object; confirm against
# '__.produce_scribe'.
_scribe = __.produce_scribe( __name__ )

30 

31 

32 

class Tokenizers( __.enum.Enum ):
    ''' Enumeration of available language model tokenizers.

        Member values are the names accepted by :py:meth:`produce`.
    '''

    AnthropicApi = 'anthropic-api'
    Tiktoken = 'tiktoken'

    @classmethod
    async def produce(
        selfclass, name: str, variant: __.Absential[ str ] = __.absent
    ) -> "Tokenizer":
        ''' Produces tokenizer from name and optional variant.

            The name is resolved to an enumeration member by value; a name
            which matches no member is rejected by the enumeration lookup.
            The variant, which may be absent, is forwarded to the factory
            of the selected tokenizer.

            Raises ``NotImplementedError`` for the Anthropic API tokenizer.
        '''
        selection = selfclass( name )
        # Enumeration members are singletons; identity checks suffice.
        if selection is Tokenizers.AnthropicApi:
            raise NotImplementedError( "Not implemented yet. Sorry." )
        if selection is Tokenizers.Tiktoken:
            return await Tiktoken.from_variant( name = variant )

50 

51 

class Tokenizer(
    __.immut.DataclassProtocol, __.typx.Protocol,
    decorators = ( __.typx.runtime_checkable, ),
):
    ''' Language model tokenizer.

        Protocol for tokenizers. Declares an asynchronous factory,
        :py:meth:`from_variant`, and an asynchronous token counter,
        :py:meth:`count`. Both are abstract.
    '''

    @classmethod
    @__.abc.abstractmethod
    async def from_variant(
        selfclass, name: __.Absential[ str ] = __.absent
    ) -> __.typx.Self:
        ''' Produces instance from name of variant.

            The name may be absent, in which case the implementation
            chooses the variant.
        '''
        # Same as 'count': fail loudly instead of silently returning None
        # if an implementation ever delegates to this abstract stub.
        raise NotImplementedError

    @__.abc.abstractmethod
    async def count( self, text: str ) -> int:
        ''' Counts number of tokens in text. '''
        raise NotImplementedError

69 

70 

71# TODO: Implement 'AnthropicApi' tokenizer. 

72 

73 

class Tiktoken( Tokenizer ):
    ''' Tokenization via 'tiktoken' package. '''

    # Encoding used by 'count'; supplied by 'from_variant'.
    codec: _tiktoken.Encoding

    @classmethod
    async def from_variant(
        selfclass, name: __.Absential[ str ] = __.absent
    ) -> __.typx.Self:
        ''' Produces instance from name of 'tiktoken' encoding.

            Uses the 'cl100k_base' encoding when the name is absent.

            Raises ``TokenizerVariantInvalidity`` if 'tiktoken' rejects the
            encoding name with a ``ValueError``.
        '''
        if __.is_absent( name ): name = 'cl100k_base'
        # Module is already imported at file level; no need to re-import.
        try: codec = _tiktoken.get_encoding( name )
        except ValueError as exc:
            from .exceptions import TokenizerVariantInvalidity
            raise TokenizerVariantInvalidity( 'tiktoken', name ) from exc
        return selfclass( codec = codec )

    async def count( self, text: str ) -> int:
        ''' Counts number of tokens in text.

            Special-token literals which appear in the text (for example,
            '<|endoftext|>') are encoded as ordinary text.
        '''
        # By default, 'encode' raises ValueError on text which contains a
        # special-token literal. Counting must work on arbitrary text, so
        # the check is disabled and such literals are tokenized as text.
        return len( self.codec.encode( text, disallowed_special = ( ) ) )