Skip to content

Commit 1e1e9d4

Browse files
author
jetstream authors
committed
Merge pull request #256 from AI-Hypercomputer:yuyan-fix-test-hang
PiperOrigin-RevId: 752361458
2 parents c5e54e1 + 77735f1 commit 1e1e9d4

File tree

2 files changed

+76
-27
lines changed

2 files changed

+76
-27
lines changed

.github/workflows/run_maxtext_jetstream_tests.yaml

Lines changed: 50 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ jobs:
3535
- name: Cleanup old docker images
3636
run: docker system prune --all --force
3737
- name: Authenticate gcloud
38-
run: gcloud auth configure-docker us-docker.pkg.dev --quiet
38+
run: gcloud auth configure-docker gcr.io --quiet
3939

4040
build_stable_stack:
4141
name: Build Stable Stack
@@ -45,6 +45,8 @@ jobs:
4545
LOCAL_IMAGE_TAG: jetstream-maxtext-stable-stack/tpu:github_${{ github.run_id }}
4646
steps:
4747
- uses: actions/checkout@v4
48+
- name: Authenticate gcloud
49+
run: gcloud auth configure-docker gcr.io --quiet
4850
- name: Build
4951
run: |
5052
pushd experimental/jetstream-maxtext-stable-stack
@@ -59,61 +61,87 @@ jobs:
5961
popd
6062
- name: Upload image
6163
run: |
62-
UPLOAD_IMAGE_TAG=gcr.io/cloud-tpu-inference-test/${LOCAL_IMAGE_TAG}
64+
UPLOAD_IMAGE_TAG=gcr.io/cloud-ml-auto-solutions/${LOCAL_IMAGE_TAG}
6365
docker tag ${LOCAL_IMAGE_TAG} ${UPLOAD_IMAGE_TAG}
6466
docker push ${UPLOAD_IMAGE_TAG}
65-
NIGHTLY_TAG=${UPLOAD_IMAGE_TAG%:*}:nightly
66-
NIGHTLY_TAG_DATE=${NIGHTLY_TAG}-$(date +"%Y%m%d")
67-
docker tag ${LOCAL_IMAGE_TAG} ${NIGHTLY_TAG}
68-
docker tag ${LOCAL_IMAGE_TAG} ${NIGHTLY_TAG_DATE}
69-
docker push ${NIGHTLY_TAG}
70-
docker push ${NIGHTLY_TAG_DATE}
7167
7268
benchmark_report:
7369
name: Benchmark Report
7470
needs: build_stable_stack
7571
runs-on: ["self-hosted", "tpu", "v6e-8"]
76-
container:
77-
# sync with the image uploaded from build_stable_stack stage
78-
image: gcr.io/cloud-tpu-inference-test/jetstream-maxtext-stable-stack/tpu:github_${{ github.run_id }}
79-
options: "--net=host --privileged"
8072
env:
81-
OUTPUT_DIR: /workspace/test_dir/
73+
OUTPUT_DIR: ./test_dir
8274
steps:
83-
- name: Create output directory # Ensure directory exists in container
84-
run: mkdir -p ${OUTPUT_DIR}
8575
- name: Test MOEBenchmarks
86-
# The report should be generated in OUTPUT_DIR, as set via the environment
87-
run: bash JetStream/.github/workflows/test_moe_benchmarks.sh
76+
run: |
77+
rm -rf ${OUTPUT_DIR}
78+
mkdir -p ${OUTPUT_DIR}
79+
# sync with the image uploaded from build_stable_stack stage
80+
# The report should be generated in OUTPUT_DIR, as set via the environment
81+
DOCKER_OUTPUT_DIR=/output
82+
docker run \
83+
-v ${OUTPUT_DIR}:${DOCKER_OUTPUT_DIR} \
84+
--env OUTPUT_DIR=${DOCKER_OUTPUT_DIR} \
85+
--privileged --net=host --rm -i \
86+
gcr.io/cloud-ml-auto-solutions/jetstream-maxtext-stable-stack/tpu:github_${{ github.run_id }} \
87+
bash -c "
88+
bash JetStream/.github/workflows/test_moe_benchmarks.sh
89+
"
8890
- name: Upload build artifact
8991
uses: actions/upload-artifact@v4
9092
with:
9193
name: benchmark_report
9294
path: ${{ env.OUTPUT_DIR }}
9395

94-
clean_up:
95-
if: ${{ always() }} # always execute, regardless of previous jobs or steps.
96+
clean_up_on_fail:
97+
if: ${{ failure() }}
9698
needs: [build_stable_stack, benchmark_report]
9799
name: "Clean up"
98100
runs-on: ["self-hosted"]
99101
permissions:
100102
contents: read
101103
issues: write # for failed-build-issue
102104
steps:
105+
- name: Authenticate gcloud
106+
run: gcloud auth configure-docker gcr.io --quiet
103107
- name: Delete TPU image
104108
# sync with the image uploaded from build_stable_stack stage
105-
run: gcloud container images delete gcr.io/cloud-tpu-inference-test/jetstream-maxtext-stable-stack/tpu:github_${{ github.run_id }} --force-delete-tags --quiet
109+
run: gcloud container images delete gcr.io/cloud-ml-auto-solutions/jetstream-maxtext-stable-stack/tpu:github_${{ github.run_id }} --force-delete-tags --quiet
110+
111+
tag_night_image:
112+
needs: [build_stable_stack, benchmark_report]
113+
name: "Tag night image"
114+
runs-on: ["self-hosted"]
115+
permissions:
116+
contents: read
117+
issues: write # for failed-build-issue
118+
steps:
119+
- name: Authenticate gcloud
120+
run: gcloud auth configure-docker gcr.io --quiet
121+
- name: Upload night image
122+
# sync with the image uploaded from build_stable_stack stage
123+
run: |
124+
UPLOAD_IMAGE_TAG=gcr.io/cloud-ml-auto-solutions/jetstream-maxtext-stable-stack/tpu:github_${{ github.run_id }}
125+
NIGHTLY_TAG=${UPLOAD_IMAGE_TAG%:*}:nightly
126+
NIGHTLY_TAG_DATE=${NIGHTLY_TAG}-$(date +"%Y%m%d")
127+
gcloud container images add-tag ${UPLOAD_IMAGE_TAG} ${NIGHTLY_TAG} --quiet
128+
gcloud container images add-tag ${UPLOAD_IMAGE_TAG} ${NIGHTLY_TAG_DATE} --quiet
129+
gcloud container images untag ${UPLOAD_IMAGE_TAG} --quiet
106130
107131
notify:
108132
name: Notify test build # creates an issue or modifies last open existing issue for failed build
109133
needs: [build_stable_stack, benchmark_report]
110134
runs-on: ["self-hosted", "tpu", "v6e-8"]
135+
env:
136+
BENCHMARK_REPORT_DIR: ./benchmark_report
111137
steps:
138+
- name: Clean previous artifact
139+
run: rm -rf ${{ env.BENCHMARK_REPORT_DIR }}  # use the env var this job declares; OUTPUT_DIR is not declared on this job
112140
- name: Download benchmark artifact
113141
uses: actions/download-artifact@v4
114142
with:
115143
name: benchmark_report
116-
path: ./benchmark_report
144+
path: ${{ env.BENCHMARK_REPORT_DIR }}
117145
- name: Check whether one of the jobs failed
118146
if: ${{ failure() }}
119147
uses: jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b # v1.2.0
@@ -133,5 +161,5 @@ jobs:
133161

134162
from: JetStream Runs
135163
secure: true
136-
attachments: ./benchmark_report/moe_8x7b.txt,./benchmark_report/moe_8x22b.txt,./benchmark_report/moe_8x22b_long_context_8k_prefill.txt,./benchmark_report/moe_8x7b_jetstream.txt
164+
attachments: ${{ env.BENCHMARK_REPORT_DIR }}/moe_8x7b.txt,${{ env.BENCHMARK_REPORT_DIR }}/moe_8x22b.txt,${{ env.BENCHMARK_REPORT_DIR }}/moe_8x22b_long_context_8k_prefill.txt,${{ env.BENCHMARK_REPORT_DIR }}/moe_8x7b_jetstream.txt
137165
body: workflow for ${{github.repository}} completed successfully!

experimental/jetstream-maxtext-stable-stack/test_script/benchmark_serving_example.sh

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
1-
# TODO: need a public path
2-
export PARAM_PATH=${PARAM_PATH}
1+
#!/bin/bash
2+
3+
SERVER_PID=""
4+
CLIENT_PID=""
35

46
python -c "import nltk; nltk.download('punkt')"
57

@@ -26,8 +28,10 @@ python -m MaxText.maxengine_server \
2628
model_call_mode=inference \
2729
sparse_matmul=False \
2830
use_chunked_prefill=true \
29-
prefill_chunk_size=64 \
30-
load_parameters_path=${PARAM_PATH} &
31+
prefill_chunk_size=256 \
32+
load_parameters_path=gs://jetstream-runner/8-7B-int8 &
33+
34+
SERVER_PID=$!
3135

3236
popd
3337

@@ -41,4 +45,21 @@ python ./JetStream/benchmarks/benchmark_serving.py \
4145
--num-prompts 100 \
4246
--max-output-length 2048 \
4347
--dataset openorca \
44-
--run-eval True
48+
--run-eval True &
49+
50+
CLIENT_PID=$!
51+
52+
while true; do
53+
# If the server is no longer running, it has crashed. Terminate the script.
54+
if ! kill -0 "${SERVER_PID}" 2>/dev/null; then
55+
exit 1
56+
fi
57+
58+
# If the client has finished, exit with its exit status
59+
if ! kill -0 "${CLIENT_PID}" 2>/dev/null; then
60+
wait $CLIENT_PID
61+
exit $?
62+
fi
63+
64+
sleep 1
65+
done

0 commit comments

Comments
 (0)