@@ -9,11 +9,25 @@ cd maxtext
 
 # moe 8x7b microbenchmark
 LIBTPU_INIT_ARGS="--xla_tpu_enable_windowed_einsum_for_reduce_scatter=false --xla_jf_spmd_threshold_for_windowed_einsum_mib=1000000" python -m MaxText.inference_microbenchmark MaxText/configs/inference.yml tokenizer_path=assets/tokenizer.mistral-v1 max_prefill_predict_length=1024 max_target_length=2048 model_name=mixtral-8x7b ici_fsdp_parallelism=1 ici_autoregressive_parallelism=1 ici_tensor_parallelism=1 ici_context_autoregressive_parallelism=8 scan_layers=false weight_dtype=bfloat16 per_device_batch_size=8 megablox=False quantization=int8 quantize_kvcache=False checkpoint_is_quantized=True load_parameters_path=gs://jetstream-runner/8-7B-int8 capacity_factor=1 attention=dot_product model_call_mode=inference sparse_matmul=False weight_dtype=bfloat16 > ${OUTPUT_DIR}/moe_8x7b.txt
-tail -n5 ${OUTPUT_DIR}/moe_8x7b.txt > ${OUTPUT_DIR}/moe_8x7b.tmp && mv ${OUTPUT_DIR}/moe_8x7b.tmp ${OUTPUT_DIR}/moe_8x7b.txt
+tail -n5 ${OUTPUT_DIR}/moe_8x7b.txt > ${OUTPUT_DIR}/moe_8x7b.tmp
+echo -e "\n" >> ${OUTPUT_DIR}/result_comparison.txt
+echo -e "\n8x7b microbenchmark prefill decode latencies" >> ${OUTPUT_DIR}/result_comparison.txt
+grep "PREFILL" ${OUTPUT_DIR}/moe_8x7b.tmp >> ${OUTPUT_DIR}/result_comparison.txt
+grep "DECODE" ${OUTPUT_DIR}/moe_8x7b.tmp >> ${OUTPUT_DIR}/result_comparison.txt
+mv ${OUTPUT_DIR}/moe_8x7b.tmp ${OUTPUT_DIR}/moe_8x7b.txt
+# tail -n5 ${OUTPUT_DIR}/moe_8x7b.txt > ${OUTPUT_DIR}/moe_8x7b.tmp && mv ${OUTPUT_DIR}/moe_8x7b.tmp ${OUTPUT_DIR}/moe_8x7b.txt
 
 # moe 8x22B microbenchmark
 LIBTPU_INIT_ARGS="--xla_tpu_enable_windowed_einsum_for_reduce_scatter=false --xla_jf_spmd_threshold_for_windowed_einsum_mib=1000000" python -m MaxText.inference_microbenchmark MaxText/configs/inference.yml load_parameters_path=gs://jetstream-runner/8-22B-int8 max_prefill_predict_length=1024 max_target_length=2048 model_name=mixtral-8x22b ici_fsdp_parallelism=1 ici_autoregressive_parallelism=1 ici_tensor_parallelism=1 ici_context_autoregressive_parallelism=8 scan_layers=false per_device_batch_size=24 attention=dot_product megablox=False quantization=int8 checkpoint_is_quantized=True quantize_kvcache=True capacity_factor=1 tokenizer_path=assets/tokenizer.mistral-v3 inference_microbenchmark_prefill_lengths="128,1024" sparse_matmul=False model_call_mode=inference > ${OUTPUT_DIR}/moe_8x22b.txt
-tail -n5 ${OUTPUT_DIR}/moe_8x22b.txt > ${OUTPUT_DIR}/moe_8x22b.tmp && mv ${OUTPUT_DIR}/moe_8x22b.tmp ${OUTPUT_DIR}/moe_8x22b.txt
+
+tail -n5 ${OUTPUT_DIR}/moe_8x22b.txt > ${OUTPUT_DIR}/moe_8x22b.tmp
+echo -e "\n" >> ${OUTPUT_DIR}/result_comparison.txt
+echo -e "\n8x22b microbenchmark prefill decode latencies" >> ${OUTPUT_DIR}/result_comparison.txt
+grep "PREFILL" ${OUTPUT_DIR}/moe_8x22b.tmp >> ${OUTPUT_DIR}/result_comparison.txt
+grep "DECODE" ${OUTPUT_DIR}/moe_8x22b.tmp >> ${OUTPUT_DIR}/result_comparison.txt
+mv ${OUTPUT_DIR}/moe_8x22b.tmp ${OUTPUT_DIR}/moe_8x22b.txt
+
+# tail -n5 ${OUTPUT_DIR}/moe_8x22b.txt > ${OUTPUT_DIR}/moe_8x22b.tmp && mv ${OUTPUT_DIR}/moe_8x22b.tmp ${OUTPUT_DIR}/moe_8x22b.txt
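+# Sketch only: the two metric-collection blocks above (and the JetStream one later in this
+# script) repeat the same tail/echo/grep/mv pattern; it could be factored into a helper such
+# as collect_latencies (hypothetical name, not defined anywhere in this script), e.g.
+#   collect_latencies() {
+#     tail -n5 "$1" > "$1.tmp"
+#     echo -e "\n$2" >> ${OUTPUT_DIR}/result_comparison.txt
+#     grep -E "PREFILL|DECODE" "$1.tmp" >> ${OUTPUT_DIR}/result_comparison.txt
+#     mv "$1.tmp" "$1"
+#   }
+#   collect_latencies ${OUTPUT_DIR}/moe_8x7b.txt "8x7b microbenchmark prefill decode latencies"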
 
 # moe 8x22B 8k context length chunked prefill with 2k prefill chunk size
 LIBTPU_INIT_ARGS="--xla_tpu_enable_windowed_einsum_for_reduce_scatter=false --xla_jf_spmd_threshold_for_windowed_einsum_mib=1000000" python -m MaxText.benchmark_chunked_prefill MaxText/configs/inference.yml load_parameters_path=gs://jetstream-runner/8-22B-int8 max_prefill_predict_length=8192 max_target_length=9000 model_name=mixtral-8x22b ici_fsdp_parallelism=1 ici_autoregressive_parallelism=1 ici_tensor_parallelism=1 scan_layers=false per_device_batch_size=24 attention=dot_product megablox=False quantization=int8 checkpoint_is_quantized=True quantize_kvcache=False capacity_factor=1 tokenizer_path=assets/tokenizer.mistral-v3 inference_microbenchmark_prefill_lengths="8192" sparse_matmul=False model_call_mode=inference ici_context_autoregressive_parallelism=8 use_chunked_prefill=True prefill_chunk_size=2048 > ${OUTPUT_DIR}/moe_8x22b_long_context_8k_prefill.txt
@@ -29,7 +43,20 @@ sleep 600
 cd ..
 
 python JetStream/benchmarks/benchmark_serving.py --tokenizer maxtext/assets/tokenizer.mistral-v1 --save-result --save-request-outputs --request-outputs-file-path outputs.json --num-prompts 1200 --max-output-length 1024 --dataset openorca --run-eval True > ${OUTPUT_DIR}/moe_8x7b_jetstream.txt
-tail -n25 ${OUTPUT_DIR}/moe_8x7b_jetstream.txt > ${OUTPUT_DIR}/moe_8x7b_jetstream.tmp && mv ${OUTPUT_DIR}/moe_8x7b_jetstream.tmp ${OUTPUT_DIR}/moe_8x7b_jetstream.txt
+# tail -n25 ${OUTPUT_DIR}/moe_8x7b_jetstream.txt > ${OUTPUT_DIR}/moe_8x7b_jetstream.tmp && mv ${OUTPUT_DIR}/moe_8x7b_jetstream.tmp ${OUTPUT_DIR}/moe_8x7b_jetstream.txt
 
 # kill Jetstream server
 kill -9 %%
+
+tail -n25 ${OUTPUT_DIR}/moe_8x7b_jetstream.txt > ${OUTPUT_DIR}/moe_8x7b_jetstream.tmp
+echo -e "\n" >> ${OUTPUT_DIR}/result_comparison.txt
+
+echo -e "\n8x7b MaxText JetStream run throughput and accuracy for Mixtral 8x7B" >> ${OUTPUT_DIR}/result_comparison.txt
+grep "throughput" ${OUTPUT_DIR}/moe_8x7b_jetstream.tmp >> ${OUTPUT_DIR}/result_comparison.txt
+grep "rouge1" ${OUTPUT_DIR}/moe_8x7b_jetstream.tmp >> ${OUTPUT_DIR}/result_comparison.txt
+
+mv ${OUTPUT_DIR}/moe_8x7b_jetstream.tmp ${OUTPUT_DIR}/moe_8x7b_jetstream.txt
+
+
+# download golden numbers from gcs
+gsutil cp gs://jetstream-inference-stable-stack-artifacts/golden-numbers/golden-numbers.txt ${OUTPUT_DIR}/
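+# Sketch only: one way the downloaded golden numbers could be checked against the collected
+# results. This assumes golden-numbers.txt mirrors the line format of result_comparison.txt;
+# the actual comparison step is not shown in this script:
+#   diff ${OUTPUT_DIR}/golden-numbers.txt ${OUTPUT_DIR}/result_comparison.txt \
+#     || echo "benchmark results deviate from golden numbers"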