3535 - name : Cleanup old docker images
3636 run : docker system prune --all --force
3737 - name : Authenticate gcloud
38- run : gcloud auth configure-docker us-docker.pkg.dev --quiet
38+ run : gcloud auth configure-docker gcr.io --quiet
3939
4040 build_stable_stack :
4141 name : Build Stable Stack
4545 LOCAL_IMAGE_TAG : jetstream-maxtext-stable-stack/tpu:github_${{ github.run_id }}
4646 steps :
4747 - uses : actions/checkout@v4
48+ - name : Authenticate gcloud
49+ run : gcloud auth configure-docker gcr.io --quiet
4850 - name : Build
4951 run : |
5052 pushd experimental/jetstream-maxtext-stable-stack
@@ -59,61 +61,87 @@ jobs:
5961 popd
6062 - name : Upload image
6163 run : |
62- UPLOAD_IMAGE_TAG=gcr.io/cloud-tpu-inference-test /${LOCAL_IMAGE_TAG}
64+ UPLOAD_IMAGE_TAG=gcr.io/cloud-ml-auto-solutions /${LOCAL_IMAGE_TAG}
6365 docker tag ${LOCAL_IMAGE_TAG} ${UPLOAD_IMAGE_TAG}
6466 docker push ${UPLOAD_IMAGE_TAG}
65- NIGHTLY_TAG=${UPLOAD_IMAGE_TAG%:*}:nightly
66- NIGHTLY_TAG_DATE=${NIGHTLY_TAG}-$(date +"%Y%m%d")
67- docker tag ${LOCAL_IMAGE_TAG} ${NIGHTLY_TAG}
68- docker tag ${LOCAL_IMAGE_TAG} ${NIGHTLY_TAG_DATE}
69- docker push ${NIGHTLY_TAG}
70- docker push ${NIGHTLY_TAG_DATE}
7167
# Runs the MoE benchmark script inside the image pushed by build_stable_stack
# and publishes the generated report directory as the `benchmark_report`
# artifact.
benchmark_report:
  name: Benchmark Report
  needs: build_stable_stack
  runs-on: ["self-hosted", "tpu", "v6e-8"]
  env:
    # Host-side report directory, relative to the step's working directory.
    OUTPUT_DIR: ./test_dir
  steps:
    - name: Test MOEBenchmarks
      run: |
        # Start from an empty report directory on every run.
        rm -rf "${OUTPUT_DIR}"
        mkdir -p "${OUTPUT_DIR}"
        # Older Docker CLI versions do not accept a relative host path in -v,
        # so resolve the bind-mount source to an absolute path first.
        HOST_OUTPUT_DIR="$(realpath "${OUTPUT_DIR}")"
        # Keep the image name in sync with the one uploaded by the
        # build_stable_stack job.
        # The report is generated in the directory named by the OUTPUT_DIR
        # environment variable that is passed into the container.
        DOCKER_OUTPUT_DIR=/output
        docker run \
          -v "${HOST_OUTPUT_DIR}:${DOCKER_OUTPUT_DIR}" \
          --env OUTPUT_DIR="${DOCKER_OUTPUT_DIR}" \
          --privileged --net=host --rm -i \
          gcr.io/cloud-ml-auto-solutions/jetstream-maxtext-stable-stack/tpu:github_${{ github.run_id }} \
          bash -c "
            bash JetStream/.github/workflows/test_moe_benchmarks.sh
          "
    - name: Upload build artifact
      uses: actions/upload-artifact@v4
      with:
        name: benchmark_report
        path: ${{ env.OUTPUT_DIR }}
9395
# Deletes this run's per-run image from the registry when an upstream job
# failed.
clean_up_on_fail:
  # Only run when one of the jobs listed in `needs` has failed.
  if: ${{ failure() }}
  needs: [build_stable_stack, benchmark_report]
  name: "Clean up"
  runs-on: ["self-hosted"]
  permissions:
    # No step in this job files or edits issues, so only read access to the
    # repository contents is granted.
    contents: read
  steps:
    - name: Authenticate gcloud
      run: gcloud auth configure-docker gcr.io --quiet
    - name: Delete TPU image
      # Keep the image name in sync with the one uploaded by the
      # build_stable_stack job.
      run: >-
        gcloud container images delete
        gcr.io/cloud-ml-auto-solutions/jetstream-maxtext-stable-stack/tpu:github_${{ github.run_id }}
        --force-delete-tags --quiet
110+
# Promotes this run's image to the nightly tags. With no `if:` condition the
# job only runs when every job listed in `needs` succeeded.
tag_night_image:
  needs: [build_stable_stack, benchmark_report]
  name: "Tag night image"
  runs-on: ["self-hosted"]
  permissions:
    # The steps below only call gcloud; nothing here files or edits issues,
    # so only read access to the repository contents is granted.
    contents: read
  steps:
    - name: Authenticate gcloud
      run: gcloud auth configure-docker gcr.io --quiet
    - name: Upload night image
      # Keep the image name in sync with the one uploaded by the
      # build_stable_stack job.
      run: |
        UPLOAD_IMAGE_TAG=gcr.io/cloud-ml-auto-solutions/jetstream-maxtext-stable-stack/tpu:github_${{ github.run_id }}
        # Strip the ":github_<run_id>" suffix and replace it with ":nightly".
        NIGHTLY_TAG=${UPLOAD_IMAGE_TAG%:*}:nightly
        # Dated variant, e.g. ":nightly-20250131".
        NIGHTLY_TAG_DATE=${NIGHTLY_TAG}-$(date +"%Y%m%d")
        gcloud container images add-tag ${UPLOAD_IMAGE_TAG} ${NIGHTLY_TAG} --quiet
        gcloud container images add-tag ${UPLOAD_IMAGE_TAG} ${NIGHTLY_TAG_DATE} --quiet
        # Remove the per-run tag now that the nightly tags have been added.
        gcloud container images untag ${UPLOAD_IMAGE_TAG} --quiet
106130
107131 notify :
108132 name : Notify test build # creates an issue or modifies last open existing issue for failed build
109133 needs : [build_stable_stack, benchmark_report]
110134 runs-on : ["self-hosted", "tpu", "v6e-8"]
env:
  # Directory the benchmark_report artifact is downloaded into. The steps of
  # this job (cleanup, download path, mail attachments) read env.OUTPUT_DIR,
  # and a job-level `env` defined on another job is not visible here, so the
  # variable has to be defined under that name in this job.
  OUTPUT_DIR: ./benchmark_report
111137 steps :
138+ - name : Clean previous artifact
139+ run : rm -rf ${{ env.OUTPUT_DIR }}
112140 - name : Download benchmark artifact
113141 uses : actions/download-artifact@v4
114142 with :
115143 name : benchmark_report
116- path : ./benchmark_report
144+ path : ${{ env.OUTPUT_DIR }}
117145 - name : Check whether one of the jobs failed
118146 if : ${{ failure() }}
119147 uses : jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b # v1.2.0
@@ -133,5 +161,5 @@ jobs:
133161134162 from : JetStream Runs
135163 secure : true
136- attachments : ./benchmark_report/ moe_8x7b.txt,./benchmark_report/ moe_8x22b.txt,./benchmark_report/ moe_8x22b_long_context_8k_prefill.txt,./benchmark_report /moe_8x7b_jetstream.txt
164+ attachments : ${{ env.OUTPUT_DIR }}/ moe_8x7b.txt,${{ env.OUTPUT_DIR }}/ moe_8x22b.txt,${{ env.OUTPUT_DIR }}/ moe_8x22b_long_context_8k_prefill.txt,${{ env.OUTPUT_DIR }} /moe_8x7b_jetstream.txt
137165 body : workflow for ${{github.repository}} completed successfully!
0 commit comments