Skip to content

Commit c50cde5

Browse files
committed
feat: auto download tiktoken files
1 parent 53ccc88 commit c50cde5

4 files changed

Lines changed: 44 additions & 2 deletions

File tree

modules/instinct-core/include/tools/HashUtils.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ namespace INSTINCT_CORE_NS {
4040
requires IsHashImplementation<Hash>
4141
static std::string HashForStream(std::istream& input_stream) {
4242
Hash hash;
43-
static constexpr size_t BUFFER_SIZE = 144*7*1024;
43+
static constexpr size_t BUFFER_SIZE = 1024;
4444
char buf[BUFFER_SIZE];
4545
while (input_stream) {
4646
input_stream.read(buf, BUFFER_SIZE);

modules/instinct-core/include/tools/file_vault/FileSystemFileVault.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,7 @@ namespace INSTINCT_CORE_NS {
137137
std::filesystem::path root_directory_;
138138
std::unordered_map<std::string, FileVaultResourceProviderPtr> resources_;
139139
std::mutex write_mutex_;
140+
std::unordered_map<std::string, std::mutex> resource_mutexes_;
140141

141142
public:
142143
explicit FileSystemFileVault(std::filesystem::path root_directory)
@@ -159,6 +160,9 @@ namespace INSTINCT_CORE_NS {
159160
return std::async(std::launch::async, [&] {
160161
assert_true(resources_.contains(named_resource),
161162
fmt::format("Resource {} should exist in vault", named_resource));
163+
164+
// lock and get resource provider
165+
std::lock_guard lock(resource_mutexes_[named_resource]);
162166
const auto resource_provider = resources_.at(named_resource);
163167

164168
// get resource paths

modules/instinct-examples/doc-agent/src/doc-agent.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ namespace insintct::exmaples::doc_agent {
141141
LOG_INFO("CreateChunkedMultiVectorRetriever");
142142

143143
// default to use tiktoken tokenizer
144-
auto tokenizer = TiktokenTokenizer::MakeGPT4Tokenizer("/Users/robinqu/Downloads/cl100k_base.tiktoken");
144+
auto tokenizer = TiktokenTokenizer::MakeGPT4Tokenizer();
145145

146146
const auto child_spliter = CreateRecursiveCharacterTextSplitter(tokenizer, {.chunk_size = retriever_options.child_chunk_size});
147147
if (retriever_options.parent_chunk_size > 0) {

modules/instinct-llm/include/tokenizer/TiktokenTokenizer.hpp

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
#include "GPT2BPEFileReader.hpp"
1313
#include "TiktokenBPEFileReader.hpp"
14+
#include "tools/file_vault/FileSystemFileVault.hpp"
1415

1516

1617
namespace INSTINCT_LLM_NS {
@@ -69,6 +70,29 @@ namespace INSTINCT_LLM_NS {
6970
return std::make_shared<TiktokenTokenizer>(bpe_ranks, vocab, UnicodeString::fromUTF8(config.pat_str), config.special_tokens, byte_shuffle);
7071
}
7172

73+
/**
 * @brief Build a GPT-2 tokenizer, downloading the BPE vocab and encoder JSON
 *        into the default file vault on first use.
 * @return TokenizerPtr for GPT-2.
 *
 * Fixes vs. previous revision:
 *  - the second GetResource call fetched "tiktoken/gpt2_vocab.bpe" twice,
 *    passing the vocab file where the encoder JSON path was expected;
 *  - the `static bool ONCE = false;` guard was never set to true and was not
 *    thread-safe. A function-local static initialized by a lambda runs exactly
 *    once and is thread-safe under C++11 magic statics.
 */
static TokenizerPtr MakeGPT2Tokenizer() {
    // Download both resources exactly once per process (thread-safe).
    static const bool downloaded = [] {
        FetchHttpGetResourceToFileVault(
            DEFAULT_FILE_VAULT,
            "tiktoken/gpt2_vocab.bpe",
            "https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe",
            {.algorithm = kSHA256, .expected_value = "1ce1664773c50f3e0cc8842619a93edc4624525b728b188a9e0be33b7726adc5"}
        ).wait();
        FetchHttpGetResourceToFileVault(
            DEFAULT_FILE_VAULT,
            "tiktoken/gpt2_encoder.json",
            "https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/encoder.json",
            {.algorithm = kSHA256, .expected_value = "196139668be63f3b5d6574427317ae82f612a97c5d1cdaf36ed2256dbf636783"}
        ).wait();
        return true;
    }();
    (void) downloaded;

    // BUG FIX: second resource is the encoder JSON, not the vocab again.
    const auto vocab_entry = DEFAULT_FILE_VAULT->GetResource("tiktoken/gpt2_vocab.bpe").get();
    const auto encoder_entry = DEFAULT_FILE_VAULT->GetResource("tiktoken/gpt2_encoder.json").get();
    return MakeGPT2Tokenizer(vocab_entry.local_path, encoder_entry.local_path);
}
95+
7296
static TokenizerPtr MakeGPT2Tokenizer(
7397
const std::filesystem::path& bpe_file_path,
7498
const std::filesystem::path& encoder_json_file_path) {
@@ -84,6 +108,20 @@ namespace INSTINCT_LLM_NS {
84108
});
85109
}
86110

111+
/**
 * @brief Build a GPT-4 (cl100k_base) tokenizer, downloading the tiktoken BPE
 *        file into the default file vault on first use.
 * @return TokenizerPtr for GPT-4.
 *
 * Fix vs. previous revision: the `static bool ONCE = false;` guard was never
 * set to true and was not thread-safe, so the fetch ran on every call and
 * could race across threads. A function-local static initialized by a lambda
 * runs exactly once and is thread-safe under C++11 magic statics.
 */
static TokenizerPtr MakeGPT4Tokenizer() {
    // Download the BPE ranks file exactly once per process (thread-safe).
    static const bool downloaded = [] {
        FetchHttpGetResourceToFileVault(
            DEFAULT_FILE_VAULT,
            "tiktoken/cl100k_base.tiktoken",
            "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken",
            {.algorithm = kSHA256, .expected_value = "223921b76ee99bde995b7ff738513eef100fb51d18c93597a113bcffe865b2a7"}
        ).wait();
        return true;
    }();
    (void) downloaded;

    const auto entry = DEFAULT_FILE_VAULT->GetResource("tiktoken/cl100k_base.tiktoken").get();
    return MakeGPT4Tokenizer(entry.local_path);
}
124+
87125
static TokenizerPtr MakeGPT4Tokenizer(
88126
const std::filesystem::path& tiktoken_bpe_file_path
89127
) {

0 commit comments

Comments
 (0)