diff --git a/backends/qualcomm/_passes/canonicalize_conv.py b/backends/qualcomm/_passes/canonicalize_conv.py index 8836ed44328..08d34ced8ba 100644 --- a/backends/qualcomm/_passes/canonicalize_conv.py +++ b/backends/qualcomm/_passes/canonicalize_conv.py @@ -115,86 +115,98 @@ def call(self, graph_module: torch.fx.GraphModule): ) with graph_module.graph.inserting_after(qdq_node_after_unsqueeze): - filter_arg = node.args[1] - filter_node = ( - filter_arg - if filter_arg.op == "placeholder" - else node.args[1].args[0] - ) - filter_node.meta["val"] = ( - filter_node.meta["val"].unsqueeze(2).contiguous() - ) - filter_tensor = get_parameter( - filter_node, self.edge_program - ).unsqueeze(2) - set_parameter( - ( - torch.nn.Parameter(filter_tensor) - if filter_tensor.dtype == torch.float - else filter_tensor - ), - filter_node, - self.edge_program, - ) - - num_args = len(node.args) - - bias_node = node.args[2] if num_args > 2 else None - stride = [1] + node.args[3] if num_args > 3 else [1, 1] - padding = [0] + node.args[4] if num_args > 4 else [0, 0] - if node.target == torch.ops.aten.conv1d.default: - dilation = [1] + node.args[5] if num_args > 5 else [1, 1] - groups = node.args[6] if num_args > 6 else 1 - conv_args = ( - qdq_node_after_unsqueeze, - node.args[1], - bias_node, - stride, - padding, - dilation, - groups, + # conv2d must be inserted before conv1d in the graph to preserve correct + # topological ordering. This is required due to conv-bn fusion: when conv1d + # has no bias, the fused bias (from batchnorm) is introduced as a new node, + # and its corresponding dq (dequantize) node must appear before conv2d in + # the execution order. 
+ with graph_module.graph.inserting_before(node): + filter_arg = node.args[1] + filter_node = ( + filter_arg + if filter_arg.op == "placeholder" + else node.args[1].args[0] ) - else: - output_padding = ( - [0] + node.args[5] if num_args > 5 else [0, 0] + filter_node.meta["val"] = filter_node.meta["val"].unsqueeze( + 2 ) - groups = node.args[6] if num_args > 6 else 1 - dilation = [1] + node.args[7] if num_args > 7 else [1, 1] - conv_args = ( - qdq_node_after_unsqueeze, - node.args[1], - bias_node, - stride, - padding, - output_padding, - groups, - dilation, - ) - conv2d_node = graph.create_node( - "call_function", - self.conv1d_op_map[node.target], - conv_args, - ) - conv2d_node.meta = copy_meta( - node.meta, lambda m: {**m, "val": m["val"].unsqueeze(2)} - ) - qdq_node_after_conv2d = append_qdq( - graph_module=graph_module, - node=conv2d_node, - qdq_node=list(node.users)[0], - ) - - with graph_module.graph.inserting_after(qdq_node_after_conv2d): - squeeze_op = torch.ops.aten.squeeze_copy.dims - squeeze_node = graph.create_node( - "call_function", - squeeze_op, + filter_tensor = get_parameter( + filter_node, self.edge_program + ).unsqueeze(2) + set_parameter( ( - qdq_node_after_conv2d, - [2], + torch.nn.Parameter(filter_tensor) + if filter_tensor.dtype == torch.float + else filter_tensor ), + filter_node, + self.edge_program, + ) + + num_args = len(node.args) + + bias_node = node.args[2] if num_args > 2 else None + stride = [1] + node.args[3] if num_args > 3 else [1, 1] + padding = [0] + node.args[4] if num_args > 4 else [0, 0] + if node.target == torch.ops.aten.conv1d.default: + dilation = ( + [1] + node.args[5] if num_args > 5 else [1, 1] + ) + groups = node.args[6] if num_args > 6 else 1 + conv_args = ( + qdq_node_after_unsqueeze, + node.args[1], + bias_node, + stride, + padding, + dilation, + groups, + ) + else: + output_padding = ( + [0] + node.args[5] if num_args > 5 else [0, 0] + ) + groups = node.args[6] if num_args > 6 else 1 + dilation = ( + [1] + 
node.args[7] if num_args > 7 else [1, 1] + ) + conv_args = ( + qdq_node_after_unsqueeze, + node.args[1], + bias_node, + stride, + padding, + output_padding, + groups, + dilation, + ) + conv2d_node = graph.create_node( + "call_function", + self.conv1d_op_map[node.target], + conv_args, + ) + conv2d_node.meta = copy_meta( + node.meta, lambda m: {**m, "val": m["val"].unsqueeze(2)} ) - squeeze_node.meta = copy_meta(node.meta) + qdq_node_after_conv2d = append_qdq( + graph_module=graph_module, + node=conv2d_node, + qdq_node=list(node.users)[0], + ) + + with graph_module.graph.inserting_after( + qdq_node_after_conv2d + ): + squeeze_op = torch.ops.aten.squeeze_copy.dims + squeeze_node = graph.create_node( + "call_function", + squeeze_op, + ( + qdq_node_after_conv2d, + [2], + ), + ) + squeeze_node.meta = copy_meta(node.meta) for user in node.users.copy(): user.replace_input_with(node, squeeze_node) diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py index da6b4bec66c..08a425147ad 100644 --- a/backends/qualcomm/tests/models.py +++ b/backends/qualcomm/tests/models.py @@ -490,6 +490,24 @@ def example_inputs(self): } +class Conv1dBn(torch.nn.Module): + def __init__(self, bias=True): + super().__init__() + self.conv = torch.nn.Conv1d( + in_channels=2048, + out_channels=2048, + kernel_size=15, + groups=2048, + bias=bias, + ) + self.batch_norm = torch.nn.BatchNorm1d(2048) + + def forward(self, x): + x = self.conv(x) + x = self.batch_norm(x) + return x + + class Conv1dSequential(torch.nn.Module): def __init__(self, bias=True): super().__init__() diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 4e23f43c2ea..52b7c9eff9c 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -377,6 +377,13 @@ def test_qnn_backend_conv1d(self): with self.subTest(i=i): self.lower_module_and_test_output(module, sample_input) + def 
test_qnn_conv1d_batch_norm(self): + modules = [Conv1dBn(), Conv1dBn(bias=False)] # noqa: F405 + sample_input = (torch.randn([1, 2048, 858]),) + for i, module in enumerate(modules): + with self.subTest(i=i): + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_conv2d(self): modules = [Conv2dSequential(), Conv2dSequential(bias=False)] # noqa: F405 sample_input = (torch.randn([1, 1, 3, 3]),) @@ -2637,6 +2644,14 @@ def test_qnn_backend_conv1d(self): module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output(module, sample_input) + def test_qnn_conv1d_batch_norm(self): + modules = [Conv1dBn(), Conv1dBn(bias=False)] # noqa: F405 + sample_input = (torch.randn([1, 2048, 858]),) + for i, module in enumerate(modules): + with self.subTest(i=i): + module = self.get_qdq_module(module, sample_input) + self.lower_module_and_test_output(module, sample_input) + def test_qnn_backend_conv2d(self): modules = [Conv2dSequential(), Conv2dSequential(bias=False)] # noqa: F405 sample_input = (torch.randn([1, 1, 3, 3]),) @@ -6870,6 +6885,11 @@ class MLLMSpecs: tok_embedding_pte_size: float decoder_pte_size: float + @dataclass(frozen=True) + class ALMSpecs(MLLMSpecs): + audio_path: str + golden_audio_feature: str + @dataclass(frozen=True) class VLMSpecs(MLLMSpecs): image_path: str @@ -6877,6 +6897,18 @@ class VLMSpecs(MLLMSpecs): # TODO: refactor to support different backends def setUp(self): + self.alm_specs = { + "granite_speech_3_3-2b": TestExampleMultimodalityScript.ALMSpecs( + max_seq_len=512, + sm8650_token_rate=5, + sm8750_token_rate=8, + encoder_pte_size=900_000_000, # 900MB + tok_embedding_pte_size=240_000_000, # 240MB + decoder_pte_size=3_000_000_000, # 3GB + audio_path="https://huggingface.co/ibm-granite/granite-speech-3.3-2b/resolve/main/10226_10111_000000.wav?download=true", # Audio content: after his nap,... 
+ golden_audio_feature="after his nap,", + ), + } self.vlm_specs = { "smolvlm_500m_instruct": TestExampleMultimodalityScript.VLMSpecs( max_seq_len=128, @@ -6900,6 +6932,96 @@ def setUp(self): ), } + def test_static_asr(self): + if not self.required_envs([self.model_name]): + self.skipTest("missing required envs") + + if self.enable_x86_64: + # Running on host is extremely slow for large models, so we skip this check to avoid timeouts. + # Please verify the output on the actual device instead. + self.skipTest( + "Skipping the check for the static ASR model on x86 due to long execution time." + ) + + alm_specs: TestExampleMultimodalityScript.ALMSpecs = self.alm_specs[ + self.model_name + ] + prompt = "can you transcribe the speech into a written format?" + audio_path = alm_specs.audio_path + cmds = [ + "python", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py", + "--artifact", + self.artifact_dir, + "--build_folder", + self.build_folder, + "--model", + self.model, + "--ip", + self.ip, + "--port", + str(self.port), + "--prompt", + prompt, + "--audio_path", + audio_path, + "--temperature", + "0", + "--decoder_model", + f"{self.model_name}", + "--model_mode", + "kv", + "--max_seq_len", + f"{alm_specs.max_seq_len}", + ] + if self.compile_only: + cmds.extend(["--compile_only"]) + elif self.device: + cmds.extend(["--device", self.device]) + if self.host: + cmds.extend(["--host", self.host]) + if self.pre_gen_pte: + cmds.extend(["--pre_gen_pte", self.pre_gen_pte]) + + p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) + with Listener((self.ip, self.port)) as listener: + conn = listener.accept() + p.communicate() + msg = json.loads(conn.recv()) + if "Error" in msg: + self.fail(msg["Error"]) + else: + if not self.compile_only: + model_out = msg["result"][0] + self.assertTrue( + alm_specs.golden_audio_feature in model_out.lower(), + f"Expected Output contains feature: '{alm_specs.golden_audio_feature}' Actual Output: '{model_out}'", + ) + 
print(f"Audio Path: {audio_path}") + print(f"Query: {prompt}") + print(f"Answer: {model_out}") + + encoder_pte_size = msg["audio_encoder_pte_size"] + tok_embedding_pte_size = msg["tok_embedding_pte_size"] + decoder_pte_size = msg["pte_size"] + self.assertLessEqual(encoder_pte_size, alm_specs.encoder_pte_size) + self.assertLessEqual( + tok_embedding_pte_size, alm_specs.tok_embedding_pte_size + ) + self.assertLessEqual(decoder_pte_size, alm_specs.decoder_pte_size) + print(f"Encoder PTE Size: {encoder_pte_size} bytes") + print(f"Token Embedding PTE Size: {tok_embedding_pte_size} bytes") + print(f"Text Decoder PTE Size: {decoder_pte_size} bytes") + + attr_name = f"{self.model.lower()}_token_rate" + if not self.compile_only and hasattr(alm_specs, attr_name): + device_inference_speed = msg["inference_speed"] + expected_inference_speed = getattr(alm_specs, attr_name) + print(f"Prompt Evaluation: {device_inference_speed} tokens/second") + self.assertGreaterEqual( + device_inference_speed, expected_inference_speed + ) + def test_static_vlm(self): if not self.required_envs([self.model_name]): self.skipTest("missing required envs") @@ -6964,7 +7086,7 @@ def test_static_vlm(self): print(f"Query: {prompt}") print(f"Answer: {model_out}") if not self.enable_x86_64: - encoder_pte_size = msg["encoder_pte_size"] + encoder_pte_size = msg["vision_encoder_pte_size"] tok_embedding_pte_size = msg["tok_embedding_pte_size"] decoder_pte_size = msg["pte_size"] self.assertLessEqual(encoder_pte_size, vlm_specs.encoder_pte_size) diff --git a/examples/models/granite_speech/BUCK b/examples/models/granite_speech/BUCK new file mode 100644 index 00000000000..9660c0cad90 --- /dev/null +++ b/examples/models/granite_speech/BUCK @@ -0,0 +1,24 @@ +load("@fbcode_macros//build_defs:build_file_migration.bzl", "fbcode_target", "non_fbcode_target") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +oncall("executorch") + +fbcode_target(_kind = runtime.python_library, + name = 
"granite_speech", + srcs = [ + "__init__.py", + "convert_weights.py", + ], + _is_external_target = True, + base_module = "executorch.examples.models.granite_speech", + resources = { + "config/2b_config.json": "config/2b_config.json", + }, + deps = [ + "//caffe2:torch", + "//executorch/examples/models/llama:llama2_model", + "fbcode//pytorch/torchtune:lib", + "fbsource//third-party/pypi/safetensors:safetensors", + ], + visibility = ["PUBLIC"], +) diff --git a/examples/models/granite_speech/__init__.py b/examples/models/granite_speech/__init__.py new file mode 100644 index 00000000000..8adefab4ed2 --- /dev/null +++ b/examples/models/granite_speech/__init__.py @@ -0,0 +1,16 @@ +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from executorch.examples.models.granite_speech.convert_weights import convert_weights +from executorch.examples.models.llama.model import Llama2Model + + +class GraniteSpeechModel(Llama2Model): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + +__all__ = [ + "GraniteSpeechModel", + "convert_weights", +] diff --git a/examples/models/granite_speech/config/2b_config.json b/examples/models/granite_speech/config/2b_config.json new file mode 100644 index 00000000000..f96e84f8f03 --- /dev/null +++ b/examples/models/granite_speech/config/2b_config.json @@ -0,0 +1,19 @@ +{ + "dim": 2048, + "attention_qkv_bias": false, + "attention_multiplier": 0.015625, + "bos_idx": 0, + "embedding_scale_factor": 12.0, + "eos_idx": 0, + "act_fn": "silu", + "hidden_dim": 8192, + "n_heads": 32, + "n_layers": 40, + "n_kv_heads": 8, + "norm_eps": 1e-05, + "rope_theta": 10000000.0, + "vocab_size": 49160, + "use_hf_rope": false, + "residual_multiplier": 0.22, + "logits_scaling": 8.0 +} diff --git a/examples/models/granite_speech/convert_weights.py b/examples/models/granite_speech/convert_weights.py new file mode 100644 index 00000000000..1f3b1a5b731 --- /dev/null +++ 
b/examples/models/granite_speech/convert_weights.py @@ -0,0 +1,111 @@ +import argparse + +import json +import os +from typing import Dict + +import torch +from safetensors.torch import load_file + +from torchtune.models.convert_weights import get_mapped_key + + +# Weight mappings from Granite-Speech's checkpoint to ExecuTorch's transformer parameters. +_GRANITE_TO_EXECUTORCH = { + "language_model.model.embed_tokens.weight": "tok_embeddings.weight", + "language_model.model.norm.weight": "norm.weight", + "language_model.model.layers.{}.self_attn.k_proj.weight": "layers.{}.attention.wk.weight", + "language_model.model.layers.{}.self_attn.q_proj.weight": "layers.{}.attention.wq.weight", + "language_model.model.layers.{}.self_attn.v_proj.weight": "layers.{}.attention.wv.weight", + "language_model.model.layers.{}.self_attn.o_proj.weight": "layers.{}.attention.wo.weight", + "language_model.model.layers.{}.input_layernorm.weight": "layers.{}.attention_norm.weight", + "language_model.model.layers.{}.post_attention_layernorm.weight": "layers.{}.ffn_norm.weight", + "language_model.model.layers.{}.mlp.gate_proj.weight": "layers.{}.feed_forward.w1.weight", + "language_model.model.layers.{}.mlp.down_proj.weight": "layers.{}.feed_forward.w2.weight", + "language_model.model.layers.{}.mlp.up_proj.weight": "layers.{}.feed_forward.w3.weight", +} + + +def granite_to_executorch( + state_dict: Dict[str, torch.Tensor] +) -> Dict[str, torch.Tensor]: + """ + Convert the state dict so that it matches what ExecuTorch's transformer definition expects. 
+ """ + converted_state_dict = {} + for key, value in state_dict.items(): + try: + new_key = get_mapped_key(key, _GRANITE_TO_EXECUTORCH) + converted_state_dict[new_key] = value + except: + # only preserve parameters of text decoder + pass + + converted_state_dict["output.weight"] = converted_state_dict[ + "tok_embeddings.weight" + ] + return converted_state_dict + + +def load_checkpoint_from_safetensors(input_dir: str) -> Dict: + index_path = os.path.join(input_dir, "model.safetensors.index.json") + if os.path.exists(index_path): + # Sharded checkpoint. + with open(index_path, "r") as f: + index = json.load(f) + weight_map = index["weight_map"] + checkpoint_shards = sorted(set(weight_map.values())) + + # Load all the shards into memory + shard_to_weights = {} + for shard in checkpoint_shards: + shard_to_weights[shard] = load_file(os.path.join(input_dir, shard)) + + # Merge tensors into consolidated state dict. + merged_state_dict = {} + for weight_name, shard in weight_map.items(): + tensor = shard_to_weights[shard][weight_name] + merged_state_dict[weight_name] = tensor + return merged_state_dict + else: + # Single checkpoint. 
+ state_dict = load_file(os.path.join(input_dir, "model.safetensors")) + return state_dict + + +def load_checkpoint(input_dir: str) -> Dict: + pytorch_path = os.path.join(input_dir, "pytorch_model.bin") + if os.path.exists(pytorch_path): + print("Loading checkpoint from PyTorch .bin file") + return torch.load(pytorch_path, map_location="cpu", weights_only=True) + print("Loading checkpoint from safetensors directory") + return load_checkpoint_from_safetensors(input_dir) + + +def convert_weights(input_dir: str, output_file: str) -> None: + print("Loading checkpoint...") + sd = load_checkpoint(input_dir) + print("Converting checkpoint...") + sd = granite_to_executorch(sd) + print("Saving checkpoint...") + torch.save(sd, output_file) + print("Done.") + + +def main(): + parser = argparse.ArgumentParser( + description="Convert Granite-Speech weights to ExecuTorch transformer format." + ) + parser.add_argument( + "input_dir", + type=str, + help="Path to directory containing safetensor checkpoint files, or PyTorch checkpoint file.", + ) + parser.add_argument("output", type=str, help="Path to the output checkpoint") + + args = parser.parse_args() + convert_weights(args.input_dir, args.output) + + +if __name__ == "__main__": + main() diff --git a/examples/qualcomm/oss_scripts/llama/README.md b/examples/qualcomm/oss_scripts/llama/README.md index fb926e9f613..794b4c11b33 100644 --- a/examples/qualcomm/oss_scripts/llama/README.md +++ b/examples/qualcomm/oss_scripts/llama/README.md @@ -4,8 +4,8 @@ **Video Tutorial:** [Build Along: Run LLMs Locally on Qualcomm Hardware Using ExecuTorch](https://www.youtube.com/watch?v=41PKDlGM3oU) -This file provides you the instructions to run LLM Decoder model and VLM model with different parameters via Qualcomm HTP backend. We currently support the following models: -- LLM +This file provides you the instructions to run LLM Decoder model, VLM model, and ALM model with different parameters via Qualcomm HTP backend. 
We currently support the following models: +- Large language models 1. LLAMA2 Stories 110M 1. LLAMA3.2 1B @@ -21,10 +21,13 @@ This file provides you the instructions to run LLM Decoder model and VLM model w 1. QWEN3 0.6B / 1.7B 1. SmolLM2 135M 1. SmolLM3 3B -- VLM +- Vision-Language Models 1. SmolVLM 500M 1. InternVL3 1B +- Audio-Language models + + 1. Granite-speech-3.3-2b We offer the following modes to execute the model: @@ -215,7 +218,7 @@ Multimodal models extend LLM by processing multiple input modalities (vision, au **Current Support Status:** - **Vision-Language Models (VLM)**: Fully supported -- **Audio-Language Models (ALM)**: Coming soon +- **Audio-Language Models (ALM)**: Fully supported ### Multimodal Architecture @@ -228,7 +231,7 @@ Multimodal inference follows these key stages: 1. **Modality-Specific Encoding** - **Vision**: Images are processed through a vision encoder to generate visual embeddings - - **Audio**: Audio waveforms are processed through an audio encoder *(future support)* + - **Audio**: Audio waveforms are processed through an audio encoder to generate audio embeddings - **Text**: Text prompts are tokenized and embedded 2. **Embedding Fusion** @@ -242,12 +245,105 @@ Multimodal inference follows these key stages: --- +## Audio-Language Model (ALM) Support + +Audio-Language Models (ALMs) combine speech/audio processing and natural language processing to understand and generate text based on audio inputs. ALMs in this framework consist of: + +### Dependencies + +ALM models require the `soundfile` package for audio loading: + +```bash +pip install soundfile +``` + + +- **[Audio Encoder](model/audio_encoder.py)**: Processes raw audio waveforms into audio embeddings (e.g., CTC encoder for Granite-speech) + - **Projector** (included in audio encoder): Aligns audio embeddings with the language model's embedding space. 
+- **[Language Decoder](model/static_llama.py)**: Reuse static llama to generate text based on fused audio and text embeddings. + +### Instructions + +#### Granite-speech-3.3-2b + +Default example using hybrid mode. +```bash +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --decoder_model granite_speech_3_3-2b --model_mode hybrid --prefill_ar_len 128 --max_seq_len 1024 --prompt "can you transcribe the speech into a written format?" --audio_path "https://huggingface.co/ibm-granite/granite-speech-3.3-2b/resolve/main/10226_10111_000000.wav?download=true" +``` + +### Specifying Custom Audio + +You can specify a custom audio file for ALM models using the `--audio_path` flag: +- **HTTP/HTTPS URLs**: Direct links to audio on the web + - Example: `"https://huggingface.co/ibm-granite/granite-speech-3.3-2b/resolve/main/10226_10111_000000.wav?download=true"` +- **HuggingFace repository filenames**: Files that exist in the model's HuggingFace repository are automatically downloaded + - Example: `"10226_10111_000000.wav"` (auto-downloaded from `ibm-granite/granite-speech-3.3-2b`) +- **Local file paths**: Absolute or relative paths to `.wav` files on your system + - Example: `"/path/to/your/audio.wav"` + +**Default behavior:** +If `--audio_path` is not specified, the system will automatically use the default audio file defined in the model's configuration file (`encoder/encoder_config.py`). 
+ +#### Audio Preprocessing + +The audio encoder configuration is defined in `encoder/encoder_config.py`: + +```python +# In encoder/encoder_config.py +@dataclass(init=False, frozen=True) +class GraniteSpeechEncoder(AudioModalityConfig): + encoder_class = GraniteSpeechCTCEncoderWrapper + audio_seq_len = 171 + audio_url = "https://huggingface.co/ibm-granite/granite-speech-3.3-2b/resolve/main/10226_10111_000000.wav?download=true" # Default audio (content: "After his nap, ...") + quant_recipe = GraniteSpeechEncoderQuantRecipe +``` + +- **audio_seq_len**: Number of audio tokens generated by the encoder. + +The audio is automatically: +1. Loaded from the specified file path or downloaded from HuggingFace +2. Read as a waveform using `soundfile` and converted to a float tensor of shape `[1, T]` +3. Processed by the HuggingFace `AutoProcessor` to produce mel-filterbank features of shape `(1, 844, 160)` +4. Passed through the CTC encoder and QFormer projector to produce audio embeddings of shape `[1, audio_seq_len, hidden_dim]` + +### Using Pre-Generated PTE Files + +If you have already compiled an ALM model, you can run inference with pre-generated PTE files: + +```bash +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --decoder_model granite_speech_3_3-2b --model_mode hybrid --prefill_ar_len 128 --max_seq_len 1024 --prompt "can you transcribe the speech into a written format?" --audio_path "https://huggingface.co/ibm-granite/granite-speech-3.3-2b/resolve/main/10226_10111_000000.wav?download=true" --pre_gen_pte ${FOLDER_TO_PRE_GEN_PTE} +``` + +### ALM Processing Details + +The ALM inference pipeline consists of: + +1. 
**Audio Encoding Phase** + - Input audio waveform is loaded and preprocessed into mel-filterbank features: `(1, 844, 160)` + - CTC encoder extracts acoustic features using Conformer blocks with block-wise local attention + - QFormer projector aligns audio embeddings to the language model dimension: `[batch, audio_seq_len, hidden_dim]` + +2. **Text Tokenization Phase** + - User prompt is tokenized into text tokens + - Text tokens are embedded: `[batch, text_seq_len, hidden_dim]` + +3. **Embedding Fusion Phase** + - Audio and text embeddings are concatenated according to the model's template + - The `