Coverage for sources/mimeogram/tokenizers.py: 95%

35 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-03-02 23:41 +0000

1# vim: set filetype=python fileencoding=utf-8: 

2# -*- coding: utf-8 -*- 

3 

4#============================================================================# 

5# # 

6# Licensed under the Apache License, Version 2.0 (the "License"); # 

7# you may not use this file except in compliance with the License. # 

8# You may obtain a copy of the License at # 

9# # 

10# http://www.apache.org/licenses/LICENSE-2.0 # 

11# # 

12# Unless required by applicable law or agreed to in writing, software # 

13# distributed under the License is distributed on an "AS IS" BASIS, # 

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # 

15# See the License for the specific language governing permissions and # 

16# limitations under the License. # 

17# # 

18#============================================================================# 

19 

20 

21''' Language model tokenizers. ''' 

22 

23 

24from __future__ import annotations 

25 

26import tiktoken as _tiktoken 

27 

28from . import __ 

29 

30 

31_scribe = __.produce_scribe( __name__ ) 

32 

33 

34 

class Tokenizers( __.enum.Enum ):
    ''' Enumeration of available language model tokenizers.

        The value of each member is the name by which the tokenizer is
        selected in :py:meth:`produce`.
    '''

    AnthropicApi = 'anthropic-api'
    Tiktoken = 'tiktoken'

    @classmethod
    async def produce(
        selfclass, name: str, variant: __.Absential[ str ] = __.absent
    ) -> Tokenizer:
        ''' Produces tokenizer from name and optional variant.

            The name is converted into an enumeration member by value
            lookup. The variant is passed through, unchanged, to the
            ``from_variant`` constructor of the selected tokenizer.

            Raises :py:exc:`NotImplementedError` if the ``anthropic-api``
            tokenizer is selected, since it has no implementation yet.
        '''
        selection = selfclass( name )
        # Guard clause: reject the member which has no implementation.
        if selection is Tokenizers.AnthropicApi:
            raise NotImplementedError( # noqa: TRY003
                "Not implemented yet. Sorry." )
        if selection is Tokenizers.Tiktoken:
            return await Tiktoken.from_variant( name = variant )

53 

54 

# pylint: disable=invalid-metaclass
class Tokenizer(
    __.typx.Protocol,
    metaclass = __.ImmutableStandardProtocolDataclass,
    decorators = ( __.standard_dataclass, __.typx.runtime_checkable, ),
):
    ''' Interface which every language model tokenizer must satisfy.

        Implementations supply an asynchronous alternate constructor,
        :py:meth:`from_variant`, and an asynchronous token counter,
        :py:meth:`count`.
    '''

    @classmethod
    @__.abc.abstractmethod
    async def from_variant(
        selfclass, name: __.Absential[ str ] = __.absent
    ) -> __.typx.Self:
        ''' Produces instance from name of variant.

            The name may be left absent; how an absent name is handled
            is up to the implementation.
        '''

    @__.abc.abstractmethod
    async def count( self, text: str ) -> int:
        ''' Counts number of tokens in text.

            Returns the count as an integer.
        '''
        raise NotImplementedError
# pylint: enable=invalid-metaclass

75 

76 

77# TODO: Implement 'AnthropicApi' tokenizer. 

78 

79 

class Tiktoken(
    Tokenizer, decorators = ( __.standard_dataclass, )
):
    ''' Tokenization via 'tiktoken' package. '''

    # Encoding object, from 'tiktoken', which performs the tokenization.
    codec: _tiktoken.Encoding

    @classmethod
    async def from_variant(
        selfclass, name: __.Absential[ str ] = __.absent
    ) -> __.typx.Self:
        ''' Produces instance from name of 'tiktoken' encoding.

            If the name is absent, then the ``cl100k_base`` encoding is
            used.

            Raises ``TokenizerVariantInvalidity`` if the encoding lookup
            rejects the name with a :py:exc:`ValueError`.
        '''
        if __.is_absent( name ): name = 'cl100k_base'
        # Module is already imported at file level; no local import needed.
        try: codec = _tiktoken.get_encoding( name )
        except ValueError as exc:
            from .exceptions import TokenizerVariantInvalidity
            raise TokenizerVariantInvalidity( 'tiktoken', name ) from exc
        return selfclass( codec = codec )

    async def count( self, text: str ) -> int:
        ''' Counts number of tokens in text.

            Text is encoded as ordinary text. Substrings which happen to
            spell a special token (e.g., ``<|endoftext|>``) are tokenized
            like any other characters rather than being rejected.
        '''
        # 'encode' with its defaults raises ValueError on text which
        # contains special token strings; arbitrary content must still be
        # countable, so the special token check is bypassed.
        return len( self.codec.encode_ordinary( text ) )