Coverage for sources/mimeogram/tokenizers.py: 95%

35 statements  

« prev     ^ index     » next       coverage.py v7.8.2, created at 2025-06-07 04:07 +0000

1# vim: set filetype=python fileencoding=utf-8: 

2# -*- coding: utf-8 -*- 

3 

4#============================================================================# 

5# # 

6# Licensed under the Apache License, Version 2.0 (the "License"); # 

7# you may not use this file except in compliance with the License. # 

8# You may obtain a copy of the License at # 

9# # 

10# http://www.apache.org/licenses/LICENSE-2.0 # 

11# # 

12# Unless required by applicable law or agreed to in writing, software # 

13# distributed under the License is distributed on an "AS IS" BASIS, # 

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 

15# See the License for the specific language governing permissions and # 

16# limitations under the License. # 

17# # 

18#============================================================================# 

19 

20 

21''' Language model tokenizers. ''' 

22 

23 

24from __future__ import annotations 

25 

26import tiktoken as _tiktoken 

27 

28from . import __ 

29 

30 

31_scribe = __.produce_scribe( __name__ ) 

32 

33 

34 

class Tokenizers( __.enum.Enum ):
    ''' Language model tokenizers.

        Each member's value is the name accepted by :meth:`produce`.
    '''

    AnthropicApi = 'anthropic-api'
    Tiktoken = 'tiktoken'

    @classmethod
    async def produce(
        selfclass, name: str, variant: __.Absential[ str ] = __.absent
    ) -> Tokenizer:
        ''' Produces tokenizer from name and optional variant.

            :param name: Value of one of the members of this enumeration.
            :param variant:
                Tokenizer-specific variant name, forwarded unchanged to the
                tokenizer's ``from_variant`` constructor.
            :raises NotImplementedError:
                If the selected tokenizer has no implementation yet.

            A name which matches no member is rejected by the enumeration
            lookup itself, before any tokenizer is selected.
        '''
        tokenizer = selfclass( name )
        # Only members with a working implementation return. Everything
        # else raises, so a member without an implementation can never fall
        # through and implicitly return None.
        if tokenizer is Tokenizers.Tiktoken:
            return await Tiktoken.from_variant( name = variant )
        raise NotImplementedError( "Not implemented yet. Sorry." )

52 

53 

class Tokenizer(
    __.typx.Protocol,
    metaclass = __.ImmutableStandardProtocolDataclass,
    decorators = ( __.standard_dataclass, __.typx.runtime_checkable, ),
):
    ''' Language model tokenizer.

        Interface for tokenizers: an asynchronous alternate constructor
        keyed on a variant name, and an asynchronous token counter.
    '''

    @classmethod
    @__.abc.abstractmethod
    async def from_variant(
        selfclass, name: __.Absential[ str ] = __.absent
    ) -> __.typx.Self:
        ''' Produces instance from name of variant.

            :param name:
                Name of variant. May be left absent, in which case the
                implementation chooses the variant.
        '''
        # Raise like 'count' does, so that the abstract body cannot
        # silently return None when reached via 'super( )'.
        raise NotImplementedError

    @__.abc.abstractmethod
    async def count( self, text: str ) -> int:
        ''' Counts number of tokens in text. '''
        raise NotImplementedError

72 

73 

74# TODO: Implement 'AnthropicApi' tokenizer. 

75 

76 

class Tiktoken(
    Tokenizer, decorators = ( __.standard_dataclass, )
):
    ''' Tokenization via 'tiktoken' package. '''

    # Encoding object used by 'count' to encode text.
    codec: _tiktoken.Encoding

    @classmethod
    async def from_variant(
        selfclass, name: __.Absential[ str ] = __.absent
    ) -> __.typx.Self:
        ''' Produces instance from name of 'tiktoken' encoding.

            :param name:
                Name of encoding to look up. If absent, then 'cl100k_base'
                is used.
            :raises TokenizerVariantInvalidity:
                If the encoding lookup raises :class:`ValueError`.
        '''
        variant = 'cl100k_base' if __.is_absent( name ) else name
        try: codec = _tiktoken.get_encoding( variant )
        except ValueError as exc:
            from .exceptions import TokenizerVariantInvalidity
            raise TokenizerVariantInvalidity( 'tiktoken', variant ) from exc
        return selfclass( codec = codec )

    async def count( self, text: str ) -> int:
        ''' Counts number of tokens which codec produces for text. '''
        tokens = self.codec.encode( text )
        return len( tokens )