1111
1212#include " GPT2BPEFileReader.hpp"
1313#include " TiktokenBPEFileReader.hpp"
14+ #include " tools/file_vault/FileSystemFileVault.hpp"
1415
1516
1617namespace INSTINCT_LLM_NS {
@@ -69,6 +70,29 @@ namespace INSTINCT_LLM_NS {
6970 return std::make_shared<TiktokenTokenizer>(bpe_ranks, vocab, UnicodeString::fromUTF8 (config.pat_str ), config.special_tokens , byte_shuffle);
7071 }
7172
73+ static TokenizerPtr MakeGPT2Tokenizer () {
74+ static bool ONCE = false ;
75+ if (!ONCE) {
76+ FetchHttpGetResourceToFileVault (
77+ DEFAULT_FILE_VAULT,
78+ " tiktoken/gpt2_vocab.bpe" ,
79+ " https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe" ,
80+ {.algorithm = kSHA256 , .expected_value = " 1ce1664773c50f3e0cc8842619a93edc4624525b728b188a9e0be33b7726adc5" }
81+ ).wait ();
82+
83+ FetchHttpGetResourceToFileVault (
84+ DEFAULT_FILE_VAULT,
85+ " tiktoken/gpt2_encoder.json" ,
86+ " https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/encoder.json" ,
87+ {.algorithm = kSHA256 , .expected_value = " 196139668be63f3b5d6574427317ae82f612a97c5d1cdaf36ed2256dbf636783" }
88+ ).wait ();
89+ }
90+
91+ const auto entry1 = DEFAULT_FILE_VAULT->GetResource (" tiktoken/gpt2_vocab.bpe" ).get ();
92+ const auto entry2 = DEFAULT_FILE_VAULT->GetResource (" tiktoken/gpt2_vocab.bpe" ).get ();
93+ return MakeGPT2Tokenizer (entry1.local_path , entry2.local_path );
94+ }
95+
7296 static TokenizerPtr MakeGPT2Tokenizer (
7397 const std::filesystem::path& bpe_file_path,
7498 const std::filesystem::path& encoder_json_file_path) {
@@ -84,6 +108,20 @@ namespace INSTINCT_LLM_NS {
84108 });
85109 }
86110
111+ static TokenizerPtr MakeGPT4Tokenizer () {
112+ static bool ONCE = false ;
113+ if (!ONCE) {
114+ FetchHttpGetResourceToFileVault (
115+ DEFAULT_FILE_VAULT,
116+ " tiktoken/cl100k_base.tiktoken" ,
117+ " https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" ,
118+ {.algorithm = kSHA256 , .expected_value = " 223921b76ee99bde995b7ff738513eef100fb51d18c93597a113bcffe865b2a7" }
119+ ).wait ();
120+ }
121+ const auto entry = DEFAULT_FILE_VAULT->GetResource (" tiktoken/cl100k_base.tiktoken" ).get ();
122+ return MakeGPT4Tokenizer (entry.local_path );
123+ }
124+
87125 static TokenizerPtr MakeGPT4Tokenizer (
88126 const std::filesystem::path& tiktoken_bpe_file_path
89127 ) {
0 commit comments