diff --git a/F2LLM/README.md b/F2LLM/README.md
index 6b79819..cf15de0 100644
--- a/F2LLM/README.md
+++ b/F2LLM/README.md
@@ -26,8 +26,8 @@ In this repo we provide a streamlined and efficient script for training embeddin
 - Setup environment following `requirements.txt`. We note that transformers>=4.51.0 is required for training Qwen3 models.
 - Download data and backbone models from Hugging Face (we use Qwen3 models).
-- Run `tokenize_data_qwen.py` to tokenize the downloaded data
-- Modify model path, data path, and other arguments in `configs/config.json`.
+- Run `python tokenize_data_general.py --model_path <MODEL_PATH> [--arch encoder|decoder|auto] [--no_append_eos_decoder]` to tokenize the downloaded data for both decoder and encoder models. On macOS/CPU, no flash-attn is needed; the model falls back to eager attention.
+- Modify model path, data path, and other arguments in `configs/config.json` (decoder) or `configs/config_bert.json` (encoder). You can also set `model_arch` explicitly if auto-detection is not desired.
 - Start training with `accelerate launch --config_file configs/accelerate_config.yaml run.py --config configs/config.json`.

 Note: we recommend setting `num_processes` to 1 in `configs/accelerate_config.yaml` and launch the training code once to generate cache for training data before starting the actual training.
@@ -42,6 +42,32 @@ where N_NODE is the number of machines; N_PROCESSES is N_NODE\*8; MASTER_IP is t

 On worker nodes, also run the above commmand but modify `machine_rank` accordingly.

+### Support for Encoder-Only Models
+
+- Decoder-only models: last non-padded token pooling (unchanged); uses flash-attn when available, otherwise falls back to eager attention.
+- Encoder-only models: auto-detected (`BertModel`, `RobertaModel`, `DebertaModel`, `ElectraModel`, `AlbertModel`, `DistilBertModel`) or forced via `model_arch`/`--arch`, which takes precedence over auto-detection.
+- Pooling options for encoders: `cls` (default), `mean`, or the `cls_mean` hybrid, selected via `pooling` in the config.
+- Tokenization: `tokenize_data_general.py` handles both architectures; you can force `--arch encoder|decoder|auto` and skip EOS appending with `--no_append_eos_decoder`.
+
+Quick start (encoder):
+
+```
+python tokenize_data_general.py \
+    --model_path bert-base-uncased \
+    --data_dir training_data \
+    --output_dir training_data/data_tokenized_bert \
+    --max_seq_length 512 \
+    --num_processes 8 \
+    --arch encoder
+
+accelerate launch --config_file configs/accelerate_config.yaml run.py --config configs/config_bert.json
+```
+
+Notes and tips:
+- Typical encoder max sequence length: 512; learning rate: 2e-5 to 5e-5.
+- For gated/private Hugging Face models, run `huggingface-cli login` first.
+- On macOS/CPU, flash-attn is not required; the code uses eager attention automatically.
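+
+For example, to train an encoder with mean pooling instead of the default CLS pooling, add the following keys to your config (a minimal sketch; see `configs/config_bert.json` for a complete config):
+
+```
+"model_arch": "encoder",
+"pooling": "mean"
+```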
+
 ### Citation

 If you use the F2LLM models, data, or code, please cite the following technical report.
diff --git a/F2LLM/arguments.py b/F2LLM/arguments.py
index b967c8f..062d00c 100644
--- a/F2LLM/arguments.py
+++ b/F2LLM/arguments.py
@@ -19,6 +19,10 @@ class Args:
     min_lr: float = 1e-6
     weight_decay: float = 1e-2
     warmup_steps: int = 100
+    # model architecture: 'decoder' (default) or 'encoder'
+    model_arch: str = "decoder"
+    # pooling strategy for embedding: 'last_token' (decoder); 'cls', 'mean', or 'cls_mean' (encoder)
+    pooling: str = "last_token"
     # embedding-related settings
     num_hard_neg: int = 7
     # train steps take precedence over epochs, set to -1 to disable
diff --git a/F2LLM/configs/config_bert.json b/F2LLM/configs/config_bert.json
new file mode 100644
index 0000000..9d2259c
--- /dev/null
+++ b/F2LLM/configs/config_bert.json
@@ -0,0 +1,21 @@
+{
+    "model_path": "bert-base-uncased",
+    "experiment_id": "bert-base-uncased+lr.2e-5+bs.16x32+context.512+2epochs",
+    "model_arch": "encoder",
+    "pooling": "cls",
+    "train_data_path": "training_data/data_tokenized_bert",
+    "output_dir": "output",
+    "tb_dir": "output/tb",
+    "cache_dir": "cache",
+    "train_batch_size": 16,
+    "checkpointing_steps": 5000,
+    "validation_steps": 5000,
+    "max_seq_length": 512,
+    "learning_rate": 2e-5,
+    "min_lr": 1e-7,
+    "weight_decay": 0.01,
+    "warmup_steps": 500,
+    "train_epochs": 2,
+    "log_interval": 100,
+    "num_hard_neg": 7
+}
diff --git a/F2LLM/model.py b/F2LLM/model.py
index d33ade7..e994841 100644
--- a/F2LLM/model.py
+++ b/F2LLM/model.py
@@ -1,5 +1,5 @@
 import torch
-from transformers import AutoModel, AutoTokenizer
+from transformers import AutoModel, AutoTokenizer, AutoConfig


 class F2LLM:
@@ -12,8 +12,33 @@ def __init__(self,
         self.args = args
         self.dtype = torch.bfloat16
         self.device = None # set after accelerator.prepare
-        self.lm = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=self.dtype, attn_implementation='flash_attention_2')
-        self.lm.config.use_cache = False
+        config = AutoConfig.from_pretrained(model_path)
+        encoder_archs = ['BertModel', 'RobertaModel', 'DebertaModel', 'ElectraModel', 'AlbertModel', 'DistilBertModel']
+
+        # Allow explicit override via args.model_arch; otherwise infer from config
+        if self.args and getattr(self.args, 'model_arch', None):
+            arch_flag = self.args.model_arch.lower()
+            self.is_encoder_only = arch_flag == 'encoder'
+        else:
+            self.is_encoder_only = any(arch in (getattr(config, 'architectures', None) or []) for arch in encoder_archs)
+
+        # Choose attention impl: prefer flash_attention_2 when available on CUDA for decoders; otherwise fall back to eager
+        if not self.is_encoder_only and torch.cuda.is_available():
+            try:
+                import flash_attn  # noqa: F401
+                attn_impl = 'flash_attention_2'
+            except Exception:
+                attn_impl = 'eager'
+        else:
+            attn_impl = 'eager'
+        self.lm = AutoModel.from_pretrained(
+            model_path,
+            trust_remote_code=True,
+            torch_dtype=self.dtype,
+            attn_implementation=attn_impl
+        )
+        if not self.is_encoder_only:
+            self.lm.config.use_cache = False
         self.tokenizer = AutoTokenizer.from_pretrained(model_path)
         self.max_seq_length = max_seq_length

@@ -24,14 +49,36 @@ def forward(self, batch):
         bs = batch['bs']
         num_hard_neg = int((len(batch['input_ids']) - 2*bs) / bs)

-        outputs = self.lm(batch['input_ids'],
-                          batch['attention_mask'],
-                          )
+        outputs = self.lm(
+            batch['input_ids'],
+            batch['attention_mask'],
+        )
+
+        hidden = outputs.last_hidden_state  # [total_bs, seq_len, dim]
+
+        # Pooling per-architecture
+        if self.is_encoder_only:
+            pooling = getattr(self.args, 'pooling', 'cls') if self.args else 'cls'
+            if pooling == 'mean':
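+                # masked mean: zero out padded positions before averaging so padding does not skew the embedding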
+                mask = batch['attention_mask'].unsqueeze(-1)  # [B, L, 1]
+                summed = (hidden * mask).sum(dim=1, keepdim=True)
+                lengths = mask.sum(dim=1, keepdim=True).clamp_min(1)
+                pooled = summed / lengths
+            elif pooling == 'cls_mean':
+                mask = batch['attention_mask'].unsqueeze(-1)
+                summed = (hidden * mask).sum(dim=1, keepdim=True)
+                lengths = mask.sum(dim=1, keepdim=True).clamp_min(1)
+                mean_pooled = summed / lengths
+                pooled = 0.5 * (hidden[:, 0:1, :] + mean_pooled)
+            else:  # default CLS
+                pooled = hidden[:, 0:1, :]
+        else:
+            # decoder-style: last non-pad token representation
+            pooled = torch.stack([hidden[i, [batch['seq_lens'][i]-1]] for i in range(len(batch['seq_lens']))])

-        passage_features_all_tokens = outputs.last_hidden_state
         return {
-            'query_passage_features': torch.stack([passage_features_all_tokens[i, [batch['seq_lens'][i]-1]] for i in range(bs)]),
-            'passage_passage_features': torch.stack([passage_features_all_tokens[i, [batch['seq_lens'][i]-1]] for i in range(bs, 2*bs)]),
-            'negative_passage_features': None if num_hard_neg == 0 else torch.stack([passage_features_all_tokens[i, [batch['seq_lens'][i]-1]] for i in range(2*bs, len(batch['seq_lens']))]).view(bs, num_hard_neg, -1)
+            'query_passage_features': pooled[:bs],
+            'passage_passage_features': pooled[bs:2*bs],
+            'negative_passage_features': None if num_hard_neg == 0 else pooled[2*bs:].view(bs, num_hard_neg, -1)
         }
diff --git a/F2LLM/requirements.txt b/F2LLM/requirements.txt
index 82fb447..92ccc18 100644
--- a/F2LLM/requirements.txt
+++ b/F2LLM/requirements.txt
@@ -1,7 +1,12 @@
 accelerate
 datasets
 deepspeed
-flash-attn
+# flash-attn is GPU-only; skip on macOS/ARM. The pip environment marker limits installation to Linux x86_64.
+flash-attn; platform_system == "Linux" and platform_machine == "x86_64"
 torch
 transformers
 tensorboard
+scikit-learn
+numpy
+pandas
+pytest
\ No newline at end of file
diff --git a/F2LLM/run.py b/F2LLM/run.py
index e40b707..960bce1 100644
--- a/F2LLM/run.py
+++ b/F2LLM/run.py
@@ -3,7 +3,8 @@
 from transformers import (
     AutoTokenizer,
     set_seed,
-    get_scheduler
+    get_scheduler,
+    AutoConfig
 )
 import os, json, random
 from datasets import load_dataset
@@ -22,6 +23,17 @@
 args.num_processes = accelerator.num_processes
 accelerator.print(args)

+# Detect architecture and normalize tokenizer padding
+config = AutoConfig.from_pretrained(args.model_path)
+encoder_archs = ['BertModel', 'RobertaModel', 'DebertaModel', 'ElectraModel', 'AlbertModel', 'DistilBertModel']
+detected_encoder = any(arch in (getattr(config, 'architectures', None) or []) for arch in encoder_archs)
+if args.model_arch:
+    is_encoder_only = args.model_arch.lower() == "encoder"
+else:
+    is_encoder_only = detected_encoder
+args.model_arch = "encoder" if is_encoder_only else "decoder"
+accelerator.print(f"Model architecture: {'encoder' if is_encoder_only else 'decoder'} | Pooling: {args.pooling}")
+
 def _stack(input_ids, max_len):
     data = [ids[:max_len] for ids in input_ids] # input_ids: list of lists
     lens = [len(x) for x in data]
@@ -70,6 +82,14 @@ def collate_fn(batch_raw):
         valid_datasets.append((dataset_name, dataset['test']))

 tokenizer = AutoTokenizer.from_pretrained(args.model_path)
+if tokenizer.pad_token_id is None:
+    if tokenizer.eos_token_id is not None:
+        tokenizer.pad_token = tokenizer.eos_token
+    elif getattr(tokenizer, 'unk_token', None):
+        tokenizer.pad_token = tokenizer.unk_token
+    else:
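+        # last resort: register a new [PAD] token (the model's embedding matrix may need resizing if this token is ever used)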
+        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+tokenizer.padding_side = 'right'

 train_loaders = {
     name: DataLoader(ds, shuffle=True, batch_size=args.train_batch_size, collate_fn=collate_fn)
diff --git a/F2LLM/smoke_encoder_decoder.py b/F2LLM/smoke_encoder_decoder.py
new file mode 100644
index 0000000..9955360
--- /dev/null
+++ b/F2LLM/smoke_encoder_decoder.py
@@ -0,0 +1,135 @@
+"""
+Lightweight smoke checks for encoder/decoder pooling and tokenizer behaviors.
+Run: python smoke_encoder_decoder.py
+"""
+import torch
+from tokenize_data_general import process_sent
+from model import F2LLM
+
+
+class MockTokenizer:
+    def __init__(self, eos_token_id=2, pad_token_id=0):
+        self.eos_token_id = eos_token_id
+        self.pad_token_id = pad_token_id
+
+    def __call__(self, sentence, max_length, truncation=True, add_special_tokens=False):
+        # deterministic token ids based on length
+        base = list(range(1, min(max_length, len(sentence.split())) + 1))
+        if add_special_tokens:
+            ids = [101] + base
+            if len(ids) < max_length:
+                ids.append(102)
+        else:
+            ids = base
+        ids = ids[:max_length]
+
+        class Output:
+            def __init__(self, ids):
+                self.input_ids = ids
+
+        return Output(ids)
+
+
+def test_process_sent_encoder_special_tokens():
+    tok = MockTokenizer()
+    arr = process_sent("hello world", tok, max_seq_length=5, is_encoder_only=True, append_eos_decoder=True)
+    assert arr[0] == 101, "CLS should be first"
+    assert arr[-1] == 102, "SEP should be last when room remains"
+
+
+def test_process_sent_decoder_eos_appended():
+    tok = MockTokenizer(eos_token_id=9)
+    arr = process_sent("a b c", tok, max_seq_length=6, is_encoder_only=False, append_eos_decoder=True)
+    assert arr[-1] == 9, "EOS should be appended for decoder when enabled"
+
+
+def test_process_sent_decoder_skip_eos():
+    tok = MockTokenizer(eos_token_id=9)
+    arr = process_sent("a b c", tok, max_seq_length=6, is_encoder_only=False, append_eos_decoder=False)
+    assert arr[-1] != 9, "EOS should not be appended when disabled"
+
+
+def test_encoder_pooling_variants():
+    class Args:
+        pooling = "cls"
+        model_arch = "encoder"
+
+    args = Args()
+    model = F2LLM.__new__(F2LLM)
+    model.args = args
+    model.is_encoder_only = True
+    bs = 2
+    num_hard_neg = 1
+    seq_lens = torch.tensor([5, 6, 7, 8, 5, 6])
+    hidden = torch.randn(bs * (2 + num_hard_neg), 10, 4)
+    attn_mask = torch.ones(bs * (2 + num_hard_neg), 10, dtype=torch.long)
+    batch = {
+        'input_ids': torch.zeros_like(attn_mask),
+        'attention_mask': attn_mask,
+        'seq_lens': seq_lens,
+        'bs': bs
+    }
+
+    class MockLM:
+        def __call__(self, input_ids, attention_mask):
+            class Output:
+                last_hidden_state = hidden
+            return Output()
+
+    model.lm = MockLM()
+    model.lm.device = hidden.device
+    model.forward = F2LLM.forward.__get__(model, F2LLM)
+
+    out_cls = model.forward(batch)
+    assert out_cls['query_passage_features'].shape == (bs, 1, hidden.size(-1))
+
+    model.args.pooling = "mean"
+    out_mean = model.forward(batch)
+    assert out_mean['query_passage_features'].shape == (bs, 1, hidden.size(-1))
+
+    model.args.pooling = "cls_mean"
+    out_cls_mean = model.forward(batch)
+    assert out_cls_mean['query_passage_features'].shape == (bs, 1, hidden.size(-1))
+
+
+def test_decoder_pooling_last_token():
+    model = F2LLM.__new__(F2LLM)
+    model.args = None
+    model.is_encoder_only = False
+    bs = 2
+    num_hard_neg = 1
+    seq_lens = torch.tensor([2, 3, 4, 5, 6, 7])
+    hidden = torch.randn(bs * (2 + num_hard_neg), 8, 4)
+    attn_mask = torch.ones(bs * (2 + num_hard_neg), 8, dtype=torch.long)
+    batch = {
+        'input_ids': torch.zeros_like(attn_mask),
+        'attention_mask': attn_mask,
+        'seq_lens': seq_lens,
+        'bs': bs
+    }
+
+    class MockLM:
+        def __call__(self, input_ids, attention_mask):
+            class Output:
+                last_hidden_state = hidden
+            return Output()
+
+    model.lm = MockLM()
+    model.lm.device = hidden.device
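+    # explicitly bind F2LLM.forward to this mock-built instance (created via __new__, so __init__ never ran)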
+    model.forward = F2LLM.forward.__get__(model, F2LLM)
+
+    out = model.forward(batch)
+    assert out['query_passage_features'].shape == (bs, 1, hidden.size(-1))
+    assert out['negative_passage_features'].shape == (bs, num_hard_neg, hidden.size(-1))
+
+
+def main():
+    tests = [
+        test_process_sent_encoder_special_tokens,
+        test_process_sent_decoder_eos_appended,
+        test_process_sent_decoder_skip_eos,
+        test_encoder_pooling_variants,
+        test_decoder_pooling_last_token,
+    ]
+    for t in tests:
+        t()
+        print(f"{t.__name__}: ok")
+    print("All smoke tests passed.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/F2LLM/tokenize_data_general.py b/F2LLM/tokenize_data_general.py
new file mode 100644
index 0000000..847e5c9
--- /dev/null
+++ b/F2LLM/tokenize_data_general.py
@@ -0,0 +1,91 @@
+from multiprocessing import Pool
+import numpy as np
+import pandas as pd
+import os
+from transformers import AutoTokenizer, AutoConfig
+from tqdm.auto import tqdm
+import argparse
+
+
+def process_sent(sentence, tokenizer, max_seq_length, is_encoder_only, append_eos_decoder=True):
+    if is_encoder_only:
+        tokenizer_outputs = tokenizer(sentence, max_length=max_seq_length, truncation=True, add_special_tokens=True)
+    else:
+        tokenizer_outputs = tokenizer(sentence, max_length=max_seq_length, truncation=True, add_special_tokens=False)
+        if append_eos_decoder and tokenizer.eos_token_id is not None:
+            if tokenizer_outputs.input_ids and tokenizer_outputs.input_ids[-1] != tokenizer.eos_token_id:
+                tokenizer_outputs.input_ids.append(tokenizer.eos_token_id)
+    return np.array(tokenizer_outputs.input_ids)
+
+
+def process_sent_batch(data, tokenizer, max_seq_length, is_encoder_only, append_eos_decoder):
+    return data.apply(lambda x: process_sent(x, tokenizer, max_seq_length, is_encoder_only, append_eos_decoder))
+
+
+def parallelize_apply(data, tokenizer, max_seq_length, is_encoder_only, append_eos_decoder, num_of_processes=8):
+    indices = np.array_split(data.index, num_of_processes)
+    data_split = [data.iloc[idx] for idx in indices]
+    args_list = [(ds, tokenizer, max_seq_length, is_encoder_only, append_eos_decoder) for ds in data_split]
+    with Pool(num_of_processes) as pool:
+        results = pool.starmap(process_sent_batch, args_list)
+    return pd.concat(results)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_path", type=str, required=True, help="Path to the model")
+    parser.add_argument("--data_dir", type=str, default='training_data', help="Directory containing training data")
+    parser.add_argument("--output_dir", type=str, default='data_tokenized', help="Directory to save tokenized data")
+    parser.add_argument("--max_seq_length", type=int, default=512, help="Maximum sequence length")
+    parser.add_argument("--num_processes", type=int, default=8, help="Number of processes for parallel tokenization")
+    parser.add_argument("--arch", type=str, choices=["auto", "encoder", "decoder"], default="auto", help="Force encoder/decoder tokenization behavior")
+    parser.add_argument("--no_append_eos_decoder", action="store_true", help="Skip appending EOS for decoder models")
+    args = parser.parse_args()
+
+    tokenizer = AutoTokenizer.from_pretrained(args.model_path)
+    config = AutoConfig.from_pretrained(args.model_path)
+    encoder_archs = ['BertModel', 'RobertaModel', 'DebertaModel', 'ElectraModel', 'AlbertModel', 'DistilBertModel']
+    detected_encoder = any(arch in (getattr(config, 'architectures', None) or []) for arch in encoder_archs)
+    if args.arch != "auto":
+        is_encoder_only = args.arch == "encoder"
+    else:
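+        # no explicit --arch given: fall back to what config.architectures suggests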
+        is_encoder_only = detected_encoder
+
+    append_eos_decoder = not args.no_append_eos_decoder
+
+    if not is_encoder_only and append_eos_decoder and tokenizer.eos_token_id is None:
+        if tokenizer.pad_token_id is not None:
+            tokenizer.eos_token_id = tokenizer.pad_token_id
+        elif getattr(tokenizer, 'unk_token_id', None) is not None:
+            tokenizer.eos_token_id = tokenizer.unk_token_id
+        else:
+            tokenizer.eos_token_id = 0
+
+    max_seq_length = args.max_seq_length - 2 if is_encoder_only else args.max_seq_length
+
+    root_dir = args.data_dir
+    output_dir = args.output_dir
+    os.makedirs(output_dir, exist_ok=True)
+
+    for ds_name in tqdm(sorted(os.listdir(root_dir))):
+        print(ds_name, flush=True)
+        df = pd.read_parquet(f"{root_dir}/{ds_name}")
+        df['query_input_ids'] = parallelize_apply(df['query'], tokenizer, max_seq_length, is_encoder_only, append_eos_decoder, args.num_processes)
+
+        num_neg = 24 if 'negative_2' in df.keys() else 1
+        texts = df.passage.to_list()
+        for i in range(1, num_neg+1):
+            texts += df[f'negative_{i}'].to_list()
+        texts = list(set(texts))
+        df_tmp = pd.DataFrame({'text': texts})
+        df_tmp['input_ids'] = parallelize_apply(df_tmp['text'], tokenizer, max_seq_length, is_encoder_only, append_eos_decoder, args.num_processes)
+        df_tmp = df_tmp.set_index('text')
+
+        df['passage_input_ids'] = df.passage.map(df_tmp.input_ids)
+        for i in range(1, num_neg+1):
+            df[f'negative_{i}_input_ids'] = df[f'negative_{i}'].map(df_tmp.input_ids)
+
+        df.to_parquet(f'{output_dir}/{ds_name}', index=False)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/README.md b/README.md
index 0a2dea2..1804ebd 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,36 @@ Embedding-related repos from CodeFuse, including:
 - [D2LLM](https://github.com/codefuse-ai/D2LLM)
 - [F2LLM](./F2LLM/README.md)

+### Encoder-Only Model Support
+
+You can now fine-tune encoder-only (BERT-style) models in F2LLM for embedding tasks. Configure via `model_arch` and `pooling` in the F2LLM config:
+
+- **`model_arch`**: `encoder` for encoder-only models; defaults to `decoder`.
+- **`pooling`**: `cls` (recommended for encoders), `mean`, `cls_mean`, or `last_token` (the decoder default).
+
+Example:
+
+```
+{
+    "model_path": "bert-base-uncased",
+    "experiment_id": "bert-enc-embeds",
+    "output_dir": "output",
+    "tb_dir": "output/tb",
+    "cache_dir": "cache",
+    "train_data_path": "F2LLM/training_data/data_tokenized",
+    "train_batch_size": 16,
+    "max_seq_length": 512,
+    "learning_rate": 2e-5,
+    "model_arch": "encoder",
+    "pooling": "cls"
+}
+```
+
+Notes:
+- Encoders default to `[CLS]` pooling; `mean` averages non-padded tokens; `cls_mean` averages the two.
+- Decoders use `last_token` pooling (existing behavior).
+- Hugging Face tokenizers add the special tokens for encoders automatically.
+
 **Star History**

 [![Star History Chart](https://api.star-history.com/svg?repos=codefuse-ai/CodeFuse-Embeddings&type=date&legend=top-left)](https://www.star-history.com/#codefuse-ai/CodeFuse-Embeddings&type=date&legend=top-left)