1212# See the License for the specific language governing permissions and
1313# limitations under the License.
1414
15- # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
16- # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
15+ # This workflow builds a stable stack for JetStream+Maxtext, runs benchmarks,
16+ # cleans up resources, and sends notifications.
1717
# Display name of the workflow in the Actions UI.
name: Run Maxtext JetStream Tests

on:
  # Push / pull-request triggers are intentionally left disabled:
  # pull_request:
  # push:
  #   branches: [ "main" ]
  # Allow manual runs from the Actions tab.
  workflow_dispatch:
  schedule:
    # Run the job daily at midnight UTC
    - cron: '0 0 * * *'
2828
jobs:
  # Prepares the self-hosted runner: verifies tooling, frees docker disk
  # space, and configures docker credentials through gcloud.
  prelim:
    runs-on: ["self-hosted", "tpu", "v6e-8"]
    steps:
      - name: Test gsutil installation
        # Fails the job with exit code 24 when gsutil is not on PATH.
        run: |
          which gsutil >/dev/null 2>&1 || { echo >&2 "gsutil is required but not installed. Aborting"; exit 24;}
      - name: Cleanup old docker images
        run: docker system prune --all --force
      - name: Authenticate gcloud
        # NOTE(review): only us-docker.pkg.dev is configured here, while the
        # later jobs push to, pull from and clean up gcr.io -- confirm the
        # runner already holds gcr.io credentials.
        run: gcloud auth configure-docker us-docker.pkg.dev --quiet

  # Builds the stable-stack image, tests it, and pushes it under three tags:
  # github_<run_id>, nightly, and nightly-<YYYYMMDD>.
  build_stable_stack:
    name: Build Stable Stack
    needs: prelim
    runs-on: ["self-hosted", "tpu", "v6e-8"]
    env:
      # Per-run tag; benchmark_report and clean_up reference the same
      # run_id-based tag.
      LOCAL_IMAGE_TAG: jetstream-maxtext-stable-stack/tpu:github_${{ github.run_id }}
    steps:
      - uses: actions/checkout@v4
      - name: Build
        run: |
          pushd experimental/jetstream-maxtext-stable-stack
          ./build.sh \
            LOCAL_IMAGE_TAG="${LOCAL_IMAGE_TAG}"
          popd
      - name: Test
        run: |
          pushd experimental/jetstream-maxtext-stable-stack
          ./test.sh \
            LOCAL_IMAGE_TAG="${LOCAL_IMAGE_TAG}"
          popd
      - name: Upload image
        run: |
          UPLOAD_IMAGE_TAG="gcr.io/cloud-tpu-inference-test/${LOCAL_IMAGE_TAG}"
          docker tag "${LOCAL_IMAGE_TAG}" "${UPLOAD_IMAGE_TAG}"
          docker push "${UPLOAD_IMAGE_TAG}"
          # Drop the ":github_<run_id>" suffix to derive the nightly tags.
          NIGHTLY_TAG="${UPLOAD_IMAGE_TAG%:*}:nightly"
          NIGHTLY_TAG_DATE="${NIGHTLY_TAG}-$(date +"%Y%m%d")"
          docker tag "${LOCAL_IMAGE_TAG}" "${NIGHTLY_TAG}"
          docker tag "${LOCAL_IMAGE_TAG}" "${NIGHTLY_TAG_DATE}"
          docker push "${NIGHTLY_TAG}"
          docker push "${NIGHTLY_TAG_DATE}"

  # Runs the MOE benchmarks inside the freshly uploaded image and publishes
  # the contents of OUTPUT_DIR as the "benchmark_report" artifact.
  benchmark_report:
    name: Benchmark Report
    needs: build_stable_stack
    runs-on: ["self-hosted", "tpu", "v6e-8"]
    container:
      # sync with the image uploaded from build_stable_stack stage
      image: gcr.io/cloud-tpu-inference-test/jetstream-maxtext-stable-stack/tpu:github_${{ github.run_id }}
      options: "--net=host --privileged"
    env:
      OUTPUT_DIR: /workspace/test_dir/
    steps:
      - name: Create output directory  # Ensure directory exists in container
        run: mkdir -p "${OUTPUT_DIR}"
      - name: Test MOEBenchmarks
        # The reports are expected to be written to OUTPUT_DIR, which the
        # script picks up from the job-level env.
        run: bash JetStream/.github/workflows/test_moe_benchmarks.sh
      - name: Upload build artifact
        uses: actions/upload-artifact@v4
        with:
          name: benchmark_report
          path: ${{ env.OUTPUT_DIR }}

  # Removes the per-run image tag from the registry once the run is over.
  clean_up:
    if: ${{ always() }}  # always execute, regardless of previous jobs or steps.
    needs: [build_stable_stack, benchmark_report]
    name: Clean up
    runs-on: ["self-hosted"]
    permissions:
      contents: read
      # NOTE(review): no step in this job opens issues; this permission may
      # belong on the notify job instead -- confirm.
      issues: write  # for failed-build-issue
    steps:
      - name: Delete TPU image
        # sync with the image uploaded from build_stable_stack stage
        # Only the per-run tag is removed. Deleting the image with
        # --force-delete-tags would delete the digest together with every
        # tag pointing at it, including the nightly and nightly-<date> tags
        # pushed by build_stable_stack.
        run: gcloud container images untag gcr.io/cloud-tpu-inference-test/jetstream-maxtext-stable-stack/tpu:github_${{ github.run_id }} --quiet
106+
40107 notify :
41108 name : Notify test build # creates an issue or modifies last open existing issue for failed build
42- needs : [prelim ]
109+ needs : [build_stable_stack, benchmark_report ]
43110 runs-on : ["self-hosted", "tpu", "v6e-8"]
44111 steps :
112+ - name : Download benchmark artifact
113+ uses : actions/download-artifact@v4
114+ with :
115+ name : benchmark_report
116+ path : ./benchmark_report
45117 - name : Check whether one of the jobs failed
46118 if : ${{ failure() }}
47119 uses : jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b # v1.2.0
6113362134 from : JetStream Runs
63135 secure : true
64- attachments : ~/test_dir /moe_8x7b.txt,~/test_dir /moe_8x22b.txt,~/test_dir /moe_8x22b_long_context_8k_prefill.txt,~/test_dir /moe_8x7b_jetstream.txt
136+ attachments : ./benchmark_report /moe_8x7b.txt,./benchmark_report /moe_8x22b.txt,./benchmark_report /moe_8x22b_long_context_8k_prefill.txt,./benchmark_report /moe_8x7b_jetstream.txt
65137 body : workflow for ${{github.repository}} completed successfully!
66- - name : Cleanup
67- run : rm -rf ~/test_dir
0 commit comments