|
# This file documents how to get started with DeepSeek v2-Lite on v5p-8.
|
# The flow of this file is as follows:
# 1. Convert the HuggingFace checkpoint (bf16) to a MaxText-compatible checkpoint (bf16):
#    the scanned format is better for training; the unscanned format is better for decoding.
# 2. Run the logit check, pre-training, fine-tuning, and decoding.

# Example usage: export HF_TOKEN=<huggingface_access_token>; export BASE_OUTPUT_PATH=<GCS_bucket_path>; bash test_deepseek.sh

# The golden logits can be generated by:
# python3 -m MaxText.scratch_code.generate_hf_golden_logits --model-id=deepseek-ai/DeepSeek-V2-Lite --output-path=golden_data_deepseek2-16b.jsonl --prompts='I love to;Today is a;What is the' --hf-model-path=$local_bf16_path --not-trust-remote-code
# where $local_bf16_path is a local directory containing the bf16 HuggingFace checkpoint.
|
set -ex

export MODEL_NAME='deepseek2-16b'
export TOKENIZER_PATH='deepseek-ai/DeepSeek-V2-Lite'
|
# Install the CPU build of torch, needed for checkpoint conversion and forward_pass_logit_checker.py
python3 -m pip install torch --index-url https://download.pytorch.org/whl/cpu
|
# Path to the MaxText package directory, e.g., $HOME/maxtext/src/MaxText
export MAXTEXT_PKG_DIR="${MAXTEXT_PKG_DIR:-${MAXTEXT_REPO_ROOT:-$PWD}/src/MaxText}"
|
if [ -z "${BASE_OUTPUT_PATH}" ]; then
  # Non-Googlers please remember to point `BASE_OUTPUT_PATH` to a GCS bucket that you own; this script uses internal buckets for testing.
  # This bucket will store all the files generated by MaxText during a run.
  export BASE_OUTPUT_PATH=gs://runner-maxtext-logs/$(date +%Y-%m-%d-%H-%M)
  echo "BASE_OUTPUT_PATH was not set; using the default ${BASE_OUTPUT_PATH}"
fi
BASE_OUTPUT_PATH=${BASE_OUTPUT_PATH%/}
echo "Using BASE_OUTPUT_PATH = ${BASE_OUTPUT_PATH}"
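# For orientation, a rough (non-authoritative) sketch of what this bucket contains after a full run of this script:
# the scanned/ and unscanned/ checkpoints written by step 1, plus one <run_name>/ directory per command below.
# If BASE_OUTPUT_PATH is a GCS path, you can list it with, e.g.:
#   gcloud storage ls ${BASE_OUTPUT_PATH}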
|
# Step 1: Checkpoint conversion
# You can use the HuggingFace checkpoint at https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite, dequantized to bf16.
# We assume the HF checkpoint has been uploaded to a GCS bucket at CKPT_BUCKET.
# Non-Googlers please remember to point `CKPT_BUCKET` to a GCS bucket that you own.
# Copy the HF checkpoint into the local directory /tmp; you are free to use a different directory.
if [ -z "${CKPT_DISK_LOCATION}" ]; then
  export CKPT_BUCKET=gs://maxtext-deepseek/deepseek2-16b/hf
  gcloud storage cp -r ${CKPT_BUCKET} /tmp
  export CKPT_DISK_LOCATION=/tmp/hf
fi
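# Alternative (untested sketch) for non-Googlers who cannot read the bucket above: download the checkpoint
# directly from HuggingFace into CKPT_DISK_LOCATION, e.g. with the huggingface_hub CLI
# (assumes `pip install -U huggingface_hub`; export HF_TOKEN first if your account requires it):
#   huggingface-cli download deepseek-ai/DeepSeek-V2-Lite --local-dir /tmp/hf
#   export CKPT_DISK_LOCATION=/tmp/hf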
|
# 1.1 Convert the checkpoint to the `scanned` format, which is more suitable for training.
JAX_PLATFORMS=cpu python3 -m MaxText.utils.ckpt_scripts.convert_deepseek_family_ckpt --base_model_path ${CKPT_DISK_LOCATION} --maxtext_model_path ${BASE_OUTPUT_PATH}/scanned --model_size ${MODEL_NAME}

# 1.2 Convert the checkpoint to the `unscanned` format, which is more suitable for decoding.
JAX_PLATFORMS=cpu python3 -m MaxText.utils.ckpt_scripts.convert_deepseek_family_unscanned_ckpt --base_model_path ${CKPT_DISK_LOCATION} --maxtext_model_path ${BASE_OUTPUT_PATH}/unscanned --model_size ${MODEL_NAME}

# Step 2:
# Define the checkpoint paths here so that they are easier to use in the `train.py` and `decode.py` commands below.
export SCANNED_CKPT_PATH=${BASE_OUTPUT_PATH}/scanned/0/items
export UNSCANNED_CKPT_PATH=${BASE_OUTPUT_PATH}/unscanned/0/items
# Non-Googlers please remember to point `DATASET_PATH` to the GCS bucket where you have your training data.
export DATASET_PATH=gs://maxtext-dataset
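# Optional sanity check (sketch; assumes BASE_OUTPUT_PATH is a GCS path): confirm the converted
# checkpoints from step 1 exist before running the logit check, fine-tuning, or decoding below.
#   gcloud storage ls ${SCANNED_CKPT_PATH}
#   gcloud storage ls ${UNSCANNED_CKPT_PATH}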
# Test whether the MaxText forward-pass logits match the golden HuggingFace logits.
# The default golden_logits_path is /deps/src/MaxText/test_assets/golden_data_${MODEL_NAME}.jsonl, copied from gs://maxtext-test-assets/golden_data_${MODEL_NAME}.jsonl
GOLDEN_LOGITS_DISK_LOCATION="/deps/src/MaxText/test_assets/golden_data_${MODEL_NAME}.jsonl"
if [ ! -f "${GOLDEN_LOGITS_DISK_LOCATION}" ]; then
  GOLDEN_LOGITS_PATH="gs://maxtext-test-assets/golden_data_${MODEL_NAME}.jsonl"
  GOLDEN_LOGITS_DISK_LOCATION=/tmp/golden_data.jsonl
  gcloud storage cp ${GOLDEN_LOGITS_PATH} ${GOLDEN_LOGITS_DISK_LOCATION}
fi
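# Fallback (sketch) if gs://maxtext-test-assets is not accessible: regenerate the golden logits locally with
# the command from the header of this file, pointing --hf-model-path at your local bf16 copy of the model:
#   python3 -m MaxText.scratch_code.generate_hf_golden_logits --model-id=deepseek-ai/DeepSeek-V2-Lite --output-path=${GOLDEN_LOGITS_DISK_LOCATION} --prompts='I love to;Today is a;What is the' --hf-model-path=${CKPT_DISK_LOCATION} --not-trust-remote-code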
python3 -m tests.forward_pass_logit_checker ${MAXTEXT_PKG_DIR}/configs/base.yml base_output_directory=${BASE_OUTPUT_PATH} run_name=forward_logits_check load_parameters_path=${SCANNED_CKPT_PATH} scan_layers=true attention=dot_product per_device_batch_size=1 model_name=${MODEL_NAME} max_prefill_predict_length=4 max_target_length=4 async_checkpointing=false sparse_matmul=false ici_fsdp_parallelism=1 ici_expert_parallelism=4 checkpoint_storage_concurrent_gb=1024 weight_dtype=float32 dtype=float32 activations_in_float32=true matmul_precision=highest float32_logits=true float32_qk_product=true --golden_logits_path=${GOLDEN_LOGITS_DISK_LOCATION} --atol=1e-4 --rtol=1e-4 --max_kl_div=5e-6

# Run pre-training - megablox implementation
python3 -m MaxText.train ${MAXTEXT_PKG_DIR}/configs/base.yml base_output_directory=${BASE_OUTPUT_PATH} run_name=megablox_pre_training model_name=${MODEL_NAME} tokenizer_type=huggingface tokenizer_path=${TOKENIZER_PATH} dataset_type=synthetic enable_checkpointing=false attention=flash sparse_matmul=True megablox=True dtype=bfloat16 weight_dtype=bfloat16 per_device_batch_size=4 steps=5 max_target_length=1024 ici_fsdp_parallelism=4

# Run fine-tuning - megablox implementation
python3 -m MaxText.train ${MAXTEXT_PKG_DIR}/configs/base.yml base_output_directory=${BASE_OUTPUT_PATH} run_name=megablox_fine_tuning model_name=${MODEL_NAME} tokenizer_type=huggingface tokenizer_path=${TOKENIZER_PATH} dataset_path=${DATASET_PATH} enable_checkpointing=true async_checkpointing=false load_parameters_path=${SCANNED_CKPT_PATH} scan_layers=True attention=flash sparse_matmul=True megablox=True dtype=bfloat16 weight_dtype=bfloat16 per_device_batch_size=4 steps=5 max_target_length=1024 ici_fsdp_parallelism=1 ici_expert_parallelism=4 checkpoint_storage_concurrent_gb=1024

# Run supervised fine-tuning - megablox implementation
# python3 -m MaxText.sft_trainer ${MAXTEXT_PKG_DIR}/configs/sft.yml base_output_directory=${BASE_OUTPUT_PATH} run_name=megablox_supervised_fine_tuning model_name=${MODEL_NAME} tokenizer_type=huggingface tokenizer_path=${TOKENIZER_PATH} dataset_type=hf enable_checkpointing=true async_checkpointing=false load_parameters_path=${SCANNED_CKPT_PATH} scan_layers=True attention=flash sparse_matmul=True megablox=True dtype=bfloat16 weight_dtype=bfloat16 per_device_batch_size=4 steps=5 max_target_length=1024 ici_fsdp_parallelism=1 ici_expert_parallelism=4 checkpoint_storage_concurrent_gb=1024

# Run decoding - megablox implementation
# Note: decoding requires a HuggingFace access token for the tokenizer, even if the model itself is not gated.
python3 -m MaxText.decode ${MAXTEXT_PKG_DIR}/configs/base.yml base_output_directory=${BASE_OUTPUT_PATH} run_name=decode model_name=${MODEL_NAME} tokenizer_type=huggingface tokenizer_path=${TOKENIZER_PATH} hf_access_token=${HF_TOKEN} load_parameters_path=${UNSCANNED_CKPT_PATH} scan_layers=False attention=dot_product sparse_matmul=True megablox=True dtype=bfloat16 weight_dtype=bfloat16 per_device_batch_size=1 max_prefill_predict_length=512 max_target_length=1024 ici_fsdp_parallelism=1 ici_tensor_parallelism=4 ici_expert_parallelism=1 checkpoint_storage_concurrent_gb=1024 mla_naive_kvcache=false prompt="An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and outputs are all vectors. The output is "