Commit 5c398f6

jetstream authors committed
Merge pull request #255 from AI-Hypercomputer:yuyan-stable-stack2
PiperOrigin-RevId: 750640829
2 parents f7427db + eb2e377

2 files changed: +96 -50 lines

.github/workflows/run_maxtext_jetstream_tests.yaml

Lines changed: 84 additions & 14 deletions
@@ -12,36 +12,108 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
-# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
+# This workflow builds a stable stack for JetStream+Maxtext, runs benchmarks,
+# cleans up resources, and sends notifications.

-name: Tests
+name: Run Maxtext JetStream Tests

 on:
   # pull_request:
   # push:
   #   branches: [ "main" ]
   workflow_dispatch:
   schedule:
-    # Run the job every 4 hours
-    - cron: '0 */24 * * *'
+    # Run the job daily at midnight UTC
+    - cron: '0 0 * * *'

 jobs:
   prelim:
     runs-on: ["self-hosted", "tpu", "v6e-8"]
     steps:
+    - name: Test gsutil installation
+      run: which gsutil >/dev/null 2>&1 || { echo >&2 "gsutil is required but not installed. Aborting"; exit 24; }
+    - name: Cleanup old docker images
+      run: docker system prune --all --force
+    - name: Authenticate gcloud
+      run: gcloud auth configure-docker us-docker.pkg.dev --quiet
+
+  build_stable_stack:
+    name: Build Stable Stack
+    needs: prelim
+    runs-on: ["self-hosted", "tpu", "v6e-8"]
+    env:
+      LOCAL_IMAGE_TAG: jetstream-maxtext-stable-stack/tpu:github_${{ github.run_id }}
+    steps:
     - uses: actions/checkout@v4
+    - name: Build
+      run: |
+        pushd experimental/jetstream-maxtext-stable-stack
+        ./build.sh \
+          LOCAL_IMAGE_TAG="${LOCAL_IMAGE_TAG}"
+        popd
+    - name: Test
+      run: |
+        pushd experimental/jetstream-maxtext-stable-stack
+        ./test.sh \
+          LOCAL_IMAGE_TAG=${LOCAL_IMAGE_TAG}
+        popd
+    - name: Upload image
+      run: |
+        UPLOAD_IMAGE_TAG=gcr.io/cloud-tpu-inference-test/${LOCAL_IMAGE_TAG}
+        docker tag ${LOCAL_IMAGE_TAG} ${UPLOAD_IMAGE_TAG}
+        docker push ${UPLOAD_IMAGE_TAG}
+        NIGHTLY_TAG=${UPLOAD_IMAGE_TAG%:*}:nightly
+        NIGHTLY_TAG_DATE=${NIGHTLY_TAG}-$(date +"%Y%m%d")
+        docker tag ${LOCAL_IMAGE_TAG} ${NIGHTLY_TAG}
+        docker tag ${LOCAL_IMAGE_TAG} ${NIGHTLY_TAG_DATE}
+        docker push ${NIGHTLY_TAG}
+        docker push ${NIGHTLY_TAG_DATE}
+
+  benchmark_report:
+    name: Benchmark Report
+    needs: build_stable_stack
+    runs-on: ["self-hosted", "tpu", "v6e-8"]
+    container:
+      # sync with the image uploaded from the build_stable_stack stage
+      image: gcr.io/cloud-tpu-inference-test/jetstream-maxtext-stable-stack/tpu:github_${{ github.run_id }}
+      options: "--net=host --privileged"
+    env:
+      OUTPUT_DIR: /workspace/test_dir/
+    steps:
+    - name: Create output directory # Ensure directory exists in container
+      run: mkdir -p ${OUTPUT_DIR}
     - name: Test MOEBenchmarks
-      run: bash .github/workflows/test_moe_benchmarks.sh
-      # run: bash .github/workflows/test_moe_8x22b_microbenchmark.sh
-    # - name: Test MOE long context chunked prefill - 8k
-    #   run: bash .github/workflows/benchmark_chunked_prefill.sh
-
+      # The report is generated in OUTPUT_DIR, as set in the env above
+      run: bash JetStream/.github/workflows/test_moe_benchmarks.sh
+    - name: Upload build artifact
+      uses: actions/upload-artifact@v4
+      with:
+        name: benchmark_report
+        path: ${{ env.OUTPUT_DIR }}
+
+  clean_up:
+    if: ${{ always() }} # always execute, regardless of previous jobs or steps.
+    needs: [build_stable_stack, benchmark_report]
+    name: "Clean up"
+    runs-on: ["self-hosted"]
+    permissions:
+      contents: read
+      issues: write # for failed-build-issue
+    steps:
+    - name: Delete TPU image
+      # sync with the image uploaded from the build_stable_stack stage
+      run: gcloud container images delete gcr.io/cloud-tpu-inference-test/jetstream-maxtext-stable-stack/tpu:github_${{ github.run_id }} --force-delete-tags --quiet
+
   notify:
     name: Notify test build # creates an issue or modifies last open existing issue for failed build
-    needs: [prelim]
+    needs: [build_stable_stack, benchmark_report]
     runs-on: ["self-hosted", "tpu", "v6e-8"]
    steps:
+    - name: Download benchmark artifact
+      uses: actions/download-artifact@v4
+      with:
+        name: benchmark_report
+        path: ./benchmark_report
     - name: Check whether one of the jobs failed
       if: ${{ failure() }}
       uses: jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b # v1.2.0
@@ -61,7 +133,5 @@ jobs:

         from: JetStream Runs
         secure: true
-        attachments: ~/test_dir/moe_8x7b.txt,~/test_dir/moe_8x22b.txt,~/test_dir/moe_8x22b_long_context_8k_prefill.txt,~/test_dir/moe_8x7b_jetstream.txt
+        attachments: ./benchmark_report/moe_8x7b.txt,./benchmark_report/moe_8x22b.txt,./benchmark_report/moe_8x22b_long_context_8k_prefill.txt,./benchmark_report/moe_8x7b_jetstream.txt
         body: workflow for ${{github.repository}} completed successfully!
-    - name: Cleanup
-      run: rm -rf ~/test_dir
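
A note on the tagging scheme in the new "Upload image" step: NIGHTLY_TAG is derived with plain bash parameter expansion, where ${UPLOAD_IMAGE_TAG%:*} strips the shortest trailing ':*' match (the run-specific ':github_<run_id>' suffix), leaving the bare repository path to which ':nightly' and a dated suffix are appended. A minimal standalone sketch of that derivation, using placeholder registry paths rather than the workflow's real ones:

#!/bin/bash
# Placeholder tags for illustration; the workflow uses its own registry path.
LOCAL_IMAGE_TAG="stable-stack/tpu:github_12345"
UPLOAD_IMAGE_TAG="gcr.io/example-project/${LOCAL_IMAGE_TAG}"

# ${VAR%:*} deletes the shortest suffix matching ':*', here ':github_12345'.
NIGHTLY_TAG="${UPLOAD_IMAGE_TAG%:*}:nightly"
NIGHTLY_TAG_DATE="${NIGHTLY_TAG}-$(date +"%Y%m%d")"

echo "${NIGHTLY_TAG}"       # gcr.io/example-project/stable-stack/tpu:nightly
echo "${NIGHTLY_TAG_DATE}"  # e.g. ...:nightly-20250101, depending on the date

The net effect is that the same image is reachable by an immutable run-specific tag, a moving 'nightly' tag, and an archival dated tag.
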
.github/workflows/test_moe_benchmarks.sh

Lines changed: 12 additions & 36 deletions
@@ -1,42 +1,23 @@
 #!/bin/bash
-mkdir ~/test_dir
-cd ~/test_dir
-git clone https://github.com/google/maxtext.git
-
-cd ~/test_dir
-git clone https://github.com/google/JetStream.git
-cd ~/test_dir
-sudo apt-get -y update
-sudo apt-get -y install python3.10-venv
-sudo apt-get -y install jq
-python -m venv .env
-source .env/bin/activate
-
-cd ~/test_dir
-cd JetStream
-pip install -e .
-cd benchmarks
-pip install -r requirements.in
-
-cd ~/test_dir
-cd maxtext/
-pip3 install wheel
-bash setup.sh MODE=stable DEVICE=tpu
+
+OUTPUT_DIR=${OUTPUT_DIR:-$(pwd)/test_dir}

 pip install nltk==3.8.1
+python -c "import nltk; nltk.download('punkt')"

+cd maxtext

 # moe 8x7b microbenchmark
-LIBTPU_INIT_ARGS="--xla_tpu_enable_windowed_einsum_for_reduce_scatter=false --xla_jf_spmd_threshold_for_windowed_einsum_mib=1000000" python -m MaxText.inference_microbenchmark MaxText/configs/inference.yml tokenizer_path=assets/tokenizer.mistral-v1 max_prefill_predict_length=1024 max_target_length=2048 model_name=mixtral-8x7b ici_fsdp_parallelism=1 ici_autoregressive_parallelism=1 ici_tensor_parallelism=1 ici_context_autoregressive_parallelism=8 scan_layers=false weight_dtype=bfloat16 per_device_batch_size=8 megablox=False quantization=int8 quantize_kvcache=False checkpoint_is_quantized=True load_parameters_path=gs://jetstream-runner/8-7B-int8 capacity_factor=1 attention=dot_product model_call_mode=inference sparse_matmul=False weight_dtype=bfloat16 > ~/test_dir/moe_8x7b.txt
-tail -n5 ~/test_dir/moe_8x7b.txt > ~/test_dir/moe_8x7b.tmp && mv ~/test_dir/moe_8x7b.tmp ~/test_dir/moe_8x7b.txt
+LIBTPU_INIT_ARGS="--xla_tpu_enable_windowed_einsum_for_reduce_scatter=false --xla_jf_spmd_threshold_for_windowed_einsum_mib=1000000" python -m MaxText.inference_microbenchmark MaxText/configs/inference.yml tokenizer_path=assets/tokenizer.mistral-v1 max_prefill_predict_length=1024 max_target_length=2048 model_name=mixtral-8x7b ici_fsdp_parallelism=1 ici_autoregressive_parallelism=1 ici_tensor_parallelism=1 ici_context_autoregressive_parallelism=8 scan_layers=false weight_dtype=bfloat16 per_device_batch_size=8 megablox=False quantization=int8 quantize_kvcache=False checkpoint_is_quantized=True load_parameters_path=gs://jetstream-runner/8-7B-int8 capacity_factor=1 attention=dot_product model_call_mode=inference sparse_matmul=False weight_dtype=bfloat16 > ${OUTPUT_DIR}/moe_8x7b.txt
+tail -n5 ${OUTPUT_DIR}/moe_8x7b.txt > ${OUTPUT_DIR}/moe_8x7b.tmp && mv ${OUTPUT_DIR}/moe_8x7b.tmp ${OUTPUT_DIR}/moe_8x7b.txt

 # moe 8x22B microbenchmark
-LIBTPU_INIT_ARGS="--xla_tpu_enable_windowed_einsum_for_reduce_scatter=false --xla_jf_spmd_threshold_for_windowed_einsum_mib=1000000" python -m MaxText.inference_microbenchmark MaxText/configs/inference.yml load_parameters_path=gs://jetstream-runner/8-22B-int8 max_prefill_predict_length=1024 max_target_length=2048 model_name=mixtral-8x22b ici_fsdp_parallelism=1 ici_autoregressive_parallelism=1 ici_tensor_parallelism=1 ici_context_autoregressive_parallelism=8 scan_layers=false per_device_batch_size=24 attention=dot_product megablox=False quantization=int8 checkpoint_is_quantized=True quantize_kvcache=True capacity_factor=1 tokenizer_path=assets/tokenizer.mistral-v3 inference_microbenchmark_prefill_lengths="128,1024" sparse_matmul=False model_call_mode=inference > ~/test_dir/moe_8x22b.txt
-tail -n5 ~/test_dir/moe_8x22b.txt > ~/test_dir/moe_8x22b.tmp && mv ~/test_dir/moe_8x22b.tmp ~/test_dir/moe_8x22b.txt
+LIBTPU_INIT_ARGS="--xla_tpu_enable_windowed_einsum_for_reduce_scatter=false --xla_jf_spmd_threshold_for_windowed_einsum_mib=1000000" python -m MaxText.inference_microbenchmark MaxText/configs/inference.yml load_parameters_path=gs://jetstream-runner/8-22B-int8 max_prefill_predict_length=1024 max_target_length=2048 model_name=mixtral-8x22b ici_fsdp_parallelism=1 ici_autoregressive_parallelism=1 ici_tensor_parallelism=1 ici_context_autoregressive_parallelism=8 scan_layers=false per_device_batch_size=24 attention=dot_product megablox=False quantization=int8 checkpoint_is_quantized=True quantize_kvcache=True capacity_factor=1 tokenizer_path=assets/tokenizer.mistral-v3 inference_microbenchmark_prefill_lengths="128,1024" sparse_matmul=False model_call_mode=inference > ${OUTPUT_DIR}/moe_8x22b.txt
+tail -n5 ${OUTPUT_DIR}/moe_8x22b.txt > ${OUTPUT_DIR}/moe_8x22b.tmp && mv ${OUTPUT_DIR}/moe_8x22b.tmp ${OUTPUT_DIR}/moe_8x22b.txt

 # moe 8x22B 8k context length chunked prefill with 2k prefill chunk size
-LIBTPU_INIT_ARGS="--xla_tpu_enable_windowed_einsum_for_reduce_scatter=false --xla_jf_spmd_threshold_for_windowed_einsum_mib=1000000" python -m MaxText.benchmark_chunked_prefill MaxText/configs/inference.yml load_parameters_path=gs://jetstream-runner/8-22B-int8 max_prefill_predict_length=8192 max_target_length=9000 model_name=mixtral-8x22b ici_fsdp_parallelism=1 ici_autoregressive_parallelism=1 ici_tensor_parallelism=1 scan_layers=false per_device_batch_size=24 attention=dot_product megablox=False quantization=int8 checkpoint_is_quantized=True quantize_kvcache=False capacity_factor=1 tokenizer_path=assets/tokenizer.mistral-v3 inference_microbenchmark_prefill_lengths="8192" sparse_matmul=False model_call_mode=inference ici_context_autoregressive_parallelism=8 use_chunked_prefill=True prefill_chunk_size=2048 > ~/test_dir/moe_8x22b_long_context_8k_prefill.txt
-tail -n5 ~/test_dir/moe_8x22b_long_context_8k_prefill.txt > ~/test_dir/moe_8x22b_long_context_8k_prefill.tmp && mv ~/test_dir/moe_8x22b_long_context_8k_prefill.tmp ~/test_dir/moe_8x22b_long_context_8k_prefill.txt
+LIBTPU_INIT_ARGS="--xla_tpu_enable_windowed_einsum_for_reduce_scatter=false --xla_jf_spmd_threshold_for_windowed_einsum_mib=1000000" python -m MaxText.benchmark_chunked_prefill MaxText/configs/inference.yml load_parameters_path=gs://jetstream-runner/8-22B-int8 max_prefill_predict_length=8192 max_target_length=9000 model_name=mixtral-8x22b ici_fsdp_parallelism=1 ici_autoregressive_parallelism=1 ici_tensor_parallelism=1 scan_layers=false per_device_batch_size=24 attention=dot_product megablox=False quantization=int8 checkpoint_is_quantized=True quantize_kvcache=False capacity_factor=1 tokenizer_path=assets/tokenizer.mistral-v3 inference_microbenchmark_prefill_lengths="8192" sparse_matmul=False model_call_mode=inference ici_context_autoregressive_parallelism=8 use_chunked_prefill=True prefill_chunk_size=2048 > ${OUTPUT_DIR}/moe_8x22b_long_context_8k_prefill.txt
+tail -n5 ${OUTPUT_DIR}/moe_8x22b_long_context_8k_prefill.txt > ${OUTPUT_DIR}/moe_8x22b_long_context_8k_prefill.tmp && mv ${OUTPUT_DIR}/moe_8x22b_long_context_8k_prefill.tmp ${OUTPUT_DIR}/moe_8x22b_long_context_8k_prefill.txt


 # moe 8x7B Maxtext Jetstream
@@ -47,13 +28,8 @@ sleep 600

 cd ..

-# copy openorca datset
-gsutil cp gs://jetstream-runner/datasets/open_orca_gpt4_tokenized_llama.calibration_1000.pkl JetStream/benchmarks/
-
-python -c "import nltk; nltk.download('punkt')"
-
-python JetStream/benchmarks/benchmark_serving.py --tokenizer ~/test_dir/maxtext/assets/tokenizer.mistral-v1 --save-result --save-request-outputs --request-outputs-file-path outputs.json --num-prompts 1200 --max-output-length 1024 --dataset openorca --run-eval True > ~/test_dir/moe_8x7b_jetstream.txt
-tail -n25 ~/test_dir/moe_8x7b_jetstream.txt > ~/test_dir/moe_8x7b_jetstream.tmp && mv ~/test_dir/moe_8x7b_jetstream.tmp ~/test_dir/moe_8x7b_jetstream.txt
+python JetStream/benchmarks/benchmark_serving.py --tokenizer maxtext/assets/tokenizer.mistral-v1 --save-result --save-request-outputs --request-outputs-file-path outputs.json --num-prompts 1200 --max-output-length 1024 --dataset openorca --run-eval True > ${OUTPUT_DIR}/moe_8x7b_jetstream.txt
+tail -n25 ${OUTPUT_DIR}/moe_8x7b_jetstream.txt > ${OUTPUT_DIR}/moe_8x7b_jetstream.tmp && mv ${OUTPUT_DIR}/moe_8x7b_jetstream.tmp ${OUTPUT_DIR}/moe_8x7b_jetstream.txt

 # kill Jetstream server
 kill -9 %%
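
Two shell idioms in the rewritten script deserve a note. First, OUTPUT_DIR=${OUTPUT_DIR:-$(pwd)/test_dir} uses default-value expansion: inside the benchmark_report container the workflow exports OUTPUT_DIR=/workspace/test_dir/, so that value wins, while a bare local run falls back to ./test_dir. Second, each report is truncated in place with a tail-to-temp-then-rename pair, since redirecting tail's output straight back onto its own input file would empty the file before tail reads it. A small sketch under those assumptions (the report name is a placeholder):

#!/bin/bash
# ${VAR:-default} expands to $VAR when it is set and non-empty,
# and to the default otherwise; VAR itself is left untouched.
OUTPUT_DIR=${OUTPUT_DIR:-$(pwd)/test_dir}
mkdir -p "${OUTPUT_DIR}"

# Stand-in for a benchmark command writing a long log.
seq 1 100 > "${OUTPUT_DIR}/report.txt"

# Keep only the summary tail: write to a temp file, then rename over the
# original, rather than redirecting tail onto its own input.
tail -n5 "${OUTPUT_DIR}/report.txt" > "${OUTPUT_DIR}/report.tmp" \
  && mv "${OUTPUT_DIR}/report.tmp" "${OUTPUT_DIR}/report.txt"

The closing kill -9 %% relies on bash job control: %% names the current job, i.e. the most recently started background job, which here is presumably the JetStream server launched with & earlier in the script, outside the lines this diff touches.
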
