[
  {
    "Category": "Dataset Resource",
    "Type": "Dataset",
    "Abbreviation": "BEA-Dialogue+",
    "Title": "Scaling Conversational Hungarian ASR: The BEA-Dialogue+ Corpus",
    "Time": "2026-05",
    "Affiliation": "",
    "Author": "Máté Gedeon, Piroska Zsófia Barta, Péter Mihajlik, Katalin Mády",
    "GitHub_Link": "",
    "Paper_Link": "https://arxiv.org/abs/2605.31469",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "Hungarian",
    "Description": "BEA-Dialogue+ is an expanded conversational Hungarian ASR corpus that relaxes the strictly speaker-disjoint split of BEA-Dialogue while preserving separation of the primary speakers, yielding 200 hours of transcribed natural conversation (up from 85). It enables a controlled study of the trade-off between additional training data and speaker overlap, evaluated with Whisper- and FastConformer-based models."
  },
  {
    "Category": "Speech Synthesis",
    "Type": "Model",
    "Abbreviation": "Chatterbox-Flash",
    "Title": "Chatterbox-Flash: Prior-Calibrated Block Diffusion for Streaming Zero-Shot TTS",
    "Time": "2026-05",
    "Affiliation": "",
    "Author": "Deokjin Seo, Gangin Park, Kihyun Nam",
    "GitHub_Link": "",
    "Paper_Link": "https://arxiv.org/abs/2605.30748",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "",
    "Description": "Chatterbox-Flash is a zero-shot TTS model created by fine-tuning a pretrained autoregressive TTS decoder into a block-diffusion decoder, enabling parallel token generation within each block while retaining block-by-block streaming. It introduces two inference-time techniques—prior-calibrated scoring and an early-decoding schedule—to counter the long-tail token bias that otherwise degrades parallel decoding quality."
  },
  {
    "Category": "Multimodal",
    "Type": "Model",
    "Abbreviation": "MindVoice",
    "Title": "MindVoice: Reconstructing Intelligible Speech from Non-invasive Neural Signals with Pretrained Priors",
    "Time": "2026-05",
    "Affiliation": "",
    "Author": "Guangyin Bao, Taiping Zeng, Jianfeng Feng, Xiangyang Xue",
    "GitHub_Link": "",
    "Paper_Link": "https://arxiv.org/abs/2605.31173",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "No",
    "Audio_Output": "Yes",
    "Language": "",
    "Description": "MindVoice is a neuro-to-speech reconstruction framework that recovers intelligible speech from noisy, spatially-blurred non-invasive neural recordings by leveraging pretrained models to compensate for incomplete semantic and acoustic information. It targets safe, scalable speech brain-computer interfaces, moving past prior methods that produced spectrally-similar but unintelligible output."
  },
  {
    "Category": "Benchmark",
    "Type": "Benchmark",
    "Abbreviation": "SURE",
    "Title": "A Unified and Reproducible Experimentation Framework for Speech Understanding",
    "Time": "2026-05",
    "Affiliation": "",
    "Author": "Jing Peng, Junhao Du, Chenghao Wang, Hanqi Li, Yi Yang, Yixuan Wang, Xiaoyu Gu, Guanyu Chen, et al.",
    "GitHub_Link": "",
    "Paper_Link": "https://arxiv.org/abs/2605.30899",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "",
    "Description": "SURE is a unified experimentation framework for speech understanding that standardizes prediction formats, normalization, and scoring to make evaluations comparable across paradigms, from conventional pipelines to Speech LLMs. It adds an agent-assisted training-conversion flow that maps papers and code into versioned, runnable training pipelines on matched open-data subsets."
  },
  {
    "Category": "Audio Generation",
    "Type": "Model",
    "Abbreviation": "SwanSphere",
    "Title": "Towards Streaming Synchronized Spatial Audio Generation via Autoregressive Diffusion Transformer",
    "Time": "2026-05",
    "Affiliation": "",
    "Author": "Ke Lei, Yu Zhang, Changhao Pan, Xueyi Pu, Wenxiang Guo, Ruiqi Li, Zhou Zhao",
    "GitHub_Link": "",
    "Paper_Link": "https://arxiv.org/abs/2605.30940",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "No",
    "Audio_Output": "Yes",
    "Language": "",
    "Description": "SwanSphere is a unified streaming framework for high-fidelity spatial audio generation from panoramic videos and text prompts. It uses a causal autoregressive diffusion transformer for low-latency streaming synthesis and a Spatial Video-Audio Contrastive (SVAC) learning strategy to align the video encoder with acoustic spatial cues."
  },
  {
    "Category": "Audio Generation",
    "Type": "Model",
    "Abbreviation": "UNISON",
    "Title": "UNISON: A Unified Sound Generation and Editing Framework via Deep LLM Fusion",
    "Time": "2026-05",
    "Affiliation": "",
    "Author": "Zhaoqing Li, Haoning Xu, Jingran Su, Yaofang Liu, Zhefan Rao, Huimeng Wang, Jiajun Deng, Tianzi Wang, et al.",
    "GitHub_Link": "",
    "Paper_Link": "https://arxiv.org/abs/2605.31530",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "",
    "Description": "UNISON is a latent diffusion framework that unifies speech generation, sound generation, and audio editing in a single set of weights, covering text-to-audio, text-to-speech, zero-shot speaker cloning, mixed speech-and-sound generation, and scene-level/timed editing. It uses layer-wise deep LLM fusion, injecting hidden states from a frozen MLLM into corresponding MM-DiT blocks for depth-matched semantic conditioning."
  },
  {
    "Category": "Model and Methods",
    "Type": "Speech Tokenizer",
    "Abbreviation": "UniAudio-Token",
    "Title": "UniAudio-Token: Empowering Semantic Speech Tokenizers with General Audio Perception",
    "Time": "2026-05",
    "Affiliation": "",
    "Author": "Yuhan Song, Linhao Zhang, Aiwei Liu, Chuhan Wu, Sijun Zhang, Wei Jia, Yuan Liu, Houfeng Wang, et al.",
    "GitHub_Link": "",
    "Paper_Link": "https://arxiv.org/abs/2605.31521",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "",
    "Description": "UniAudio-Token augments single-codebook semantic speech tokenizers with general audio perception without sacrificing speech ability, addressing the 'acoustic blindness' of linguistically-focused tokenizers. It introduces Semantic-Acoustic Primitives (SAP) that decompose audio into linguistic content, vocal attributes, and auditory-scene primitives, plus a content-aware Semantic-Acoustic Equilibrium (SAE) gating mechanism."
  },
  {
    "Category": "Multimodal",
    "Type": "Omni-Modal LLM",
    "Abbreviation": "MiniCPM-o",
    "Title": "MiniCPM-o 4.5: Towards Real-Time Full-Duplex Omni-Modal Interaction",
    "Time": "2026-04",
    "Affiliation": "ModelBest (OpenBMB), Tsinghua University",
    "Author": "Junbo Cui, Bokai Xu, Chongyi Wang, Tianyu Yu, Weiyue Sun, Yingjing Xu, et al.",
    "GitHub_Link": "https://github.com/OpenBMB/MiniCPM-o",
    "Paper_Link": "https://arxiv.org/abs/2604.27393",
    "HF_Link": "https://huggingface.co/openbmb/MiniCPM-o-2_6",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "Multilingual",
    "Description": "MiniCPM-o 4.5 is OpenBMB's compact (8B-class) full-duplex omni-modal LLM supporting real-time vision, speech, and text interaction with low-latency streaming TTS, designed for on-device and edge deployment."
  },
  {
    "Category": "Model and Methods",
    "Type": "Speech Recognition Model",
    "Abbreviation": "Fun-ASR-Nano",
    "Title": "Fun-ASR Technical Report",
    "Time": "2025-12",
    "Affiliation": "FunAudioLLM Team, Tongyi Lab, Alibaba Group",
    "Author": "FunAudioLLM Team",
    "GitHub_Link": "https://github.com/FunAudioLLM/Fun-ASR",
    "Paper_Link": "https://arxiv.org/abs/2509.12508",
    "HF_Link": "https://huggingface.co/FunAudioLLM/Fun-ASR-Nano-2512",
    "Demo_Link": "https://huggingface.co/spaces/FunAudioLLM/Fun-ASR-Nano",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "Multilingual",
    "Description": "End-to-end LLM-based ASR (SenseVoice Encoder + Transformer Adaptor + Qwen3-0.6B LLM + CTC Decoder) from the FunAudioLLM team. Trained on tens of millions of hours of real speech, supports 31 languages, 7 Chinese dialects, 26 regional accents, lyrics recognition, hotwords, timestamps, and speaker diarization. Streaming inference accelerated via vLLM (up to 393x realtime)."
  },
  {
    "Category": "Model and Methods",
    "Type": "Omni-Modal LLM",
    "Abbreviation": "Qwen3-Omni",
    "Title": "Qwen3-Omni Technical Report",
    "Time": "2025-09",
    "Affiliation": "Qwen Team, Alibaba Group",
    "Author": "Qwen Team (Jin Xu, Zhifang Guo, Hangrui Hu, Yunfei Chu, Xiong Wang, Jinzheng He, Yuxuan Wang, et al.)",
    "GitHub_Link": "https://github.com/QwenLM/Qwen3-Omni",
    "Paper_Link": "https://arxiv.org/abs/2509.17765",
    "HF_Link": "https://huggingface.co/collections/Qwen/qwen3-omni",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "Multilingual",
    "Description": "Qwen3-Omni is the third-generation omni-modal LLM from Alibaba, scaling up the Thinker-Talker design with stronger multilingual ASR, audio understanding, and real-time speech generation across 100+ input and 30+ output languages."
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "ACORN",
    "Title": "Teaching Physical Awareness to LLMs through Sounds",
    "Time": "2025-07",
    "Affiliation": "NIO",
    "Author": "Weiguo Wang, Andy Nie, Wenrui Zhou, Yi Kai, Chengchen Hu",
    "GitHub_Link": "",
    "Paper_Link": "https://arxiv.org/abs/2506.08524",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "https://icml.cc/virtual/2025/poster/46139",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "",
    "Description": "ACORN explores and validates the feasibility of teaching LLMs to understand the physical world through sounds."
  },
  {
    "Category": "Model and Methods",
    "Type": "Audio LLM",
    "Abbreviation": "Audio Flamingo 3",
    "Title": "Audio Flamingo 3: Advancing Audio Intelligence with Fully Open Large Audio Language Models",
    "Time": "2025-07",
    "Affiliation": "NVIDIA",
    "Author": "Arushi Goel, Sreyan Ghosh, Jaehyeon Kim, Sonal Kumar, Zhifeng Kong, Sang-gil Lee, Chao-Han Huck Yang, Ramani Duraiswami, Dinesh Manocha, Rafael Valle, Bryan Catanzaro",
    "GitHub_Link": "https://github.com/NVIDIA/audio-flamingo",
    "Paper_Link": "https://arxiv.org/abs/2507.08128",
    "HF_Link": "https://huggingface.co/nvidia/audio-flamingo-3",
    "Demo_Link": "https://research.nvidia.com/labs/adlr/AF3/",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "Multilingual",
    "Description": "Audio Flamingo 3 (AF3) is the third generation of NVIDIA's fully-open audio LLM, supporting longer audio context (up to ~10 min), think-then-answer reasoning, and stronger multilingual coverage. Training data, weights, and recipes are all released."
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "DIFFA",
    "Title": "DIFFA: Large Language Diffusion Models Can Listen and Understand",
    "Time": "2025-07",
    "Affiliation": "Nankai University (NKU-HLT)",
    "Author": "Jiaming Zhou, Hongjie Chen, Shiwan Zhao, Jian Kang, Jie Li, Enzhi Wang, Yujie Guo, Haoqin Sun, Hui Wang, Aobo Kong, Yong Qin, Xuelong Li",
    "GitHub_Link": "https://github.com/NKU-HLT/DIFFA",
    "Paper_Link": "https://arxiv.org/abs/2507.18452",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "Multilingual",
    "Description": "DIFFA explores whether large language diffusion models (rather than autoregressive LLMs) can be adapted to listen to and understand audio, building an audio-conditioned diffusion language model and showing it can match autoregressive counterparts on audio understanding tasks."
  },
  {
    "Category": "Chatbot",
    "Type": "Spoken Dialogue Model",
    "Abbreviation": "OpenS2S",
    "Title": "OpenS2S: Advancing Fully Open-Source End-to-End Empathetic Large Speech Language Model",
    "Time": "2025-07",
    "Affiliation": "CASIA",
    "Author": "Chen Wang, Tianyu Peng, Wen Yang, Yinan Bai, Guangfu Wang, Jun Lin, Lanpeng Jia, et al.",
    "GitHub_Link": "https://github.com/CASIA-LM/OpenS2S",
    "Paper_Link": "https://arxiv.org/abs/2507.05177",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "Multilingual",
    "Description": "OpenS2S is a fully open-source end-to-end empathetic speech-to-speech LLM, releasing training data, training recipe, and model weights, with explicit attention to paralinguistic empathy in spoken dialogue."
  },
  {
    "Category": "Model and Methods",
    "Type": "Audio LLM",
    "Abbreviation": "Step-Audio 2",
    "Title": "Step-Audio 2 Technical Report",
    "Time": "2025-07",
    "Affiliation": "Step-Audio Team, StepFun",
    "Author": "Step-Audio Team (Boyong Wu, Chao Yan, Chen Hu, Cheng Yi, Chengli Feng, Fei Tian, Feiyu Shen, Gang Li, et al.)",
    "GitHub_Link": "https://github.com/stepfun-ai/Step-Audio2",
    "Paper_Link": "https://arxiv.org/abs/2507.16632",
    "HF_Link": "https://huggingface.co/stepfun-ai",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "Multilingual",
    "Description": "Step-Audio 2 is the successor to Step-Audio, scaling the unified speech understanding-and-generation LLM with stronger emotion, paralinguistics, and real-time interaction. Supports both bilingual (Chinese / English) and multilingual end-to-end speech dialogue."
  },
  {
    "Category": "Speech Recognition",
    "Type": "Audio Understanding Model",
    "Abbreviation": "Voxtral",
    "Title": "Voxtral",
    "Time": "2025-07",
    "Affiliation": "Mistral AI",
    "Author": "Mistral AI (Alexander H. Liu, Andy Ehrenberg, Andy Lo, Clément Denoix, Corentin Barreau, et al.)",
    "GitHub_Link": "",
    "Paper_Link": "https://arxiv.org/abs/2507.13264",
    "HF_Link": "https://huggingface.co/mistralai/Voxtral-Mini-3B-2507",
    "Demo_Link": "",
    "Other_Link": "https://mistral.ai/news/voxtral/",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "Multilingual",
    "Description": "Voxtral is Mistral AI's open audio LLM family (3B and 24B) for speech transcription, multilingual understanding, and Q&A over long-form audio — released with permissive weights and competitive performance against closed-source ASR systems."
  },
  {
    "Category": "Benchmark",
    "Type": "Benchmark",
    "Abbreviation": "CMI-Bench",
    "Title": "CMI-Bench: A Comprehensive Benchmark for Evaluating Music Instruction Following",
    "Time": "2025-06",
    "Affiliation": "Queen Mary University of London",
    "Author": "Yinghao Ma, Siyou Li, Juntao Yu, Emmanouil Benetos, Akira Maezawa",
    "GitHub_Link": "https://github.com/nicolaus625/CMI-bench",
    "Paper_Link": "https://arxiv.org/abs/2506.12285",
    "HF_Link": "https://huggingface.co/papers/2506.12285",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "",
    "Description": "This work presents CMI-Bench, a benchmark that evaluates audio-text LLMs on diverse music tasks by reformatting traditional MIR annotations into instruction-following formats. It highlights performance gaps and biases, offering a foundation for improving music-aware LLMs."
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "PAL",
    "Title": "PAL: Probing Audio Encoders via LLMs - A Study of Information Transfer from Audio Encoders to LLMs",
    "Time": "2025-06",
    "Affiliation": "CVSSP,PAI@University of Surrey UK, MBZUAI Abu Dhabi",
    "Author": "Tony Alex, Wish Suharitdamrong, Sara Atito, Armin Mustafa, Philip J. B. Jackson, Imran Razzak, Muhammad Awais",
    "GitHub_Link": "https://github.com/ta012/PAL-AudioLLM",
    "Paper_Link": "https://arxiv.org/abs/2506.10423",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "https://ta012.github.io/PAL/",
    "Audio_Input": "Yes",
    "Audio_Output": "",
    "Language": "Multilingual",
    "Description": "PAL investigates and explores strategies for integrating audio encoders with LLMs, focusing on efficient cross-modal information transfer. Guided by hypotheses derived from mechanistic interpretability studies and the operational principles of LLMs."
  },
  {
    "Category": "Speech Synthesis",
    "Type": "TTS Model",
    "Abbreviation": "CosyVoice 3",
    "Title": "CosyVoice 3: Towards In-the-wild Speech Generation via Scaling-up and Post-training",
    "Time": "2025-05",
    "Affiliation": "FunAudioLLM Team, Tongyi Lab, Alibaba Group",
    "Author": "Zhihao Du, Changfeng Gao, Yuxuan Wang, Fan Yu, Tianyu Zhao, Hao Wang, Xiang Lv, Hui Wang, Xian Shi, Keyu An, Guanrou Yang, Yabin Li, Yanni Chen, Zhifu Gao, Qian Chen, Yue Gu, Mengzhe Chen, Yafeng Chen, Shiliang Zhang, Wen Wang, Jieping Ye",
    "GitHub_Link": "https://github.com/FunAudioLLM/CosyVoice",
    "Paper_Link": "https://arxiv.org/abs/2505.17589",
    "HF_Link": "https://huggingface.co/FunAudioLLM",
    "Demo_Link": "https://funaudiollm.github.io/cosyvoice3/",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "Multilingual",
    "Description": "CosyVoice 3 scales the CosyVoice TTS stack with significantly larger pre-training data and a dedicated post-training stage, targeting in-the-wild speech generation across more languages, accents, and acoustic conditions."
  },
  {
    "Category": "Benchmark",
    "Type": "Benchmark",
    "Abbreviation": "LALM-Temporal-Bench",
    "Title": "Benchmarking and Confidence Evaluation of LALMs For Temporal Reasoning",
    "Time": "2025-05",
    "Affiliation": "Indian Institute of Science (IISc), Bangalore",
    "Author": "Debarpan Bhattacharya, Apoorva Kulkarni, Sriram Ganapathy",
    "GitHub_Link": "",
    "Paper_Link": "https://arxiv.org/abs/2505.13115",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "https://www.isca-archive.org/interspeech_2025/bhattacharya25b_interspeech.pdf",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "English",
    "Description": "An INTERSPEECH 2025 benchmark for evaluating Large Audio-Language Models (LALMs) on temporal reasoning over audio, with an additional analysis of model confidence calibration on these tasks."
  },
  {
    "Category": "Benchmark",
    "Type": "Benchmark",
    "Abbreviation": "MMAR",
    "Title": "MMAR: A Challenging Benchmark for Deep Reasoning in Speech, Audio, Music, and Their Mix",
    "Time": "2025-05",
    "Affiliation": "Shanghai Jiao Tong University",
    "Author": "Ziyang Ma, Yinghao Ma, Yanqiao Zhu, Chen Yang, Yi-Wen Chao, Ruiyang Xu, Wenxi Chen, Yuanzhe Chen, Zhuo Chen, Jian Cong, Kai Li, Keliang Li, Siyou Li, Xinfeng Li, Xiquan Li, Zheng Lian, Yuzhe Liang, Minghao Liu, Zhikang Niu, Tianrui Wang, Yuping Wang, Yuxuan Wang, Yihao Wu, Guanrou Yang, Jianwei Yu, Ruibin Yuan, Zhisheng Zheng, Ziya Zhou, Haina Zhu, Wei Xue, Emmanouil Benetos, Kai Yu, Eng-Siong Chng, Xie Chen",
    "GitHub_Link": "",
    "Paper_Link": "https://arxiv.org/abs/2505.13032",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "-",
    "Audio_Output": "-",
    "Language": "Multilingual",
    "Description": "MMAR is a challenging benchmark of 1,000 real-world audio QA triplets designed to evaluate deep, multi-layer reasoning in Audio-Language Models across diverse sound, music, and speech tasks, with hierarchical annotations and Chain-of-Thought rationales to drive progress in audio reasoning research."
  },
  {
    "Category": "Model and Methods",
    "Type": "Audio LLM",
    "Abbreviation": "Kimi-Audio",
    "Title": "Kimi-Audio Technical Report",
    "Time": "2025-04",
    "Affiliation": "Moonshot AI",
    "Author": "KimiTeam (Ding Ding, Zeqian Ju, Yichong Leng, Songxiang Liu, Tong Liu, Zeyu Shang, et al.)",
    "GitHub_Link": "https://github.com/MoonshotAI/Kimi-Audio",
    "Paper_Link": "https://arxiv.org/abs/2504.18425",
    "HF_Link": "https://huggingface.co/moonshotai/Kimi-Audio-7B-Instruct",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "Multilingual",
    "Description": "Kimi-Audio is Moonshot AI's open-source audio foundation model unifying speech understanding, audio understanding, and speech generation in a single LLM, trained on ~13M hours of audio with strong performance on ASR, audio captioning, audio QA, and speech dialogue."
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "Audio Flamingo 2",
    "Title": "Audio Flamingo 2: An Audio-Language Model with Long-Audio Understanding and Expert Reasoning Abilities",
    "Time": "2025-03",
    "Affiliation": "NVIDIA, University of Maryland",
    "Author": "Sreyan Ghosh, Zhifeng Kong, Sonal Kumar, S Sakshi, Jaehyeon Kim, Wei Ping, Rafael Valle, Dinesh Manocha, Bryan Catanzaro",
    "GitHub_Link": "https://github.com/NVIDIA/audio-flamingo",
    "Paper_Link": "https://arxiv.org/abs/2503.03983",
    "HF_Link": "https://huggingface.co/nvidia/audio-flamingo-2",
    "Demo_Link": "https://research.nvidia.com/labs/adlr/AF2/",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "English",
    "Description": "Audio Flamingo 2 (AF2) is the successor to Audio Flamingo, designed for long-audio understanding (up to 5 minutes) and expert reasoning over non-speech sounds and music. The authors also introduce AudioSkills, LongAudio, and LongAudioBench to support training and evaluation."
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "Audio-Reasoner",
    "Title": "Audio-Reasoner: Improving Reasoning Capability in Large Audio Language Models",
    "Time": "2025-03",
    "Affiliation": "Nanyang Technological University, Skywork AI",
    "Author": "Zhifei Xie, Mingbao Lin, Zihang Liu, Pengcheng Wu, Shuicheng Yan, Chunyan Miao",
    "GitHub_Link": "https://github.com/xzf-thu/Audio-Reasoner",
    "Paper_Link": "https://arxiv.org/abs/2503.02318",
    "HF_Link": "",
    "Demo_Link": "https://xzf-thu.github.io/Audio-Reasoner/",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "Multilingual",
    "Description": "Audio-Reasoner is a large audio language model designed for deep reasoning over audio. The authors construct CoTA, a 1.2M-sample chain-of-thought dataset for audio tasks, and fine-tune the model to perform structured reasoning on audio understanding benchmarks."
  },
  {
    "Category": "Speech Synthesis",
    "Type": "TTS Model",
    "Abbreviation": "FireRedTTS",
    "Title": "FireRedTTS-1S: An Upgraded Streamable Foundation Text-to-Speech System",
    "Time": "2025-03",
    "Affiliation": "FireRed Team, Xiaohongshu",
    "Author": "Hao-Han Guo, Yao Hu, Fei-Yu Shen, Xu Tang, Yi-Chen Wu, Feng-Long Xie, Kun Xie",
    "GitHub_Link": "https://github.com/FireRedTeam/FireRedTTS",
    "Paper_Link": "https://arxiv.org/abs/2503.20499",
    "HF_Link": "https://huggingface.co/FireRedTeam",
    "Demo_Link": "https://fireredteam.github.io/demos/firered_tts_1s/",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "Multilingual",
    "Description": "FireRedTTS-1S is Xiaohongshu's streamable foundation TTS, improving streaming latency and prosody control over its predecessor with chunk-wise generation suitable for live voice products."
  },
  {
    "Category": "Benchmark",
    "Type": "Benchmark",
    "Abbreviation": "Full-Duplex-Bench",
    "Title": "Full-Duplex-Bench: A Benchmark to Evaluate Full-duplex Spoken Dialogue Models on Turn-taking Capabilities",
    "Time": "2025-03",
    "Affiliation": "National Taiwan University, UC Berkeley, MIT",
    "Author": "Guan-Ting Lin, Jiachen Lian, Tingle Li, Qirui Wang, Gopala Anumanchipalli, Alexander H. Liu, Hung-yi Lee",
    "GitHub_Link": "https://github.com/DanielLin94144/Full-Duplex-Bench",
    "Paper_Link": "https://arxiv.org/abs/2503.04721",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "English",
    "Description": "Full-Duplex-Bench is a benchmark for evaluating full-duplex spoken dialogue models on real-time interaction phenomena such as turn-taking, pauses, interruptions, and backchanneling — capabilities that traditional half-duplex evaluation cannot cover."
  },
  {
    "Category": "Model and Methods",
    "Type": "Multimodal Language Model",
    "Abbreviation": "Phi-4-Mini",
    "Title": "Phi-4-Mini Technical Report: Compact yet Powerful Multimodal Language Models via Mixture-of-LoRAs",
    "Time": "2025-03",
    "Affiliation": "Microsoft",
    "Author": "Microsoft Phi-4-Mini Team",
    "GitHub_Link": "https://github.com/microsoft/Phi-4",
    "Paper_Link": "https://arxiv.org/abs/2503.01743",
    "HF_Link": "https://huggingface.co/microsoft/Phi-4-multimodal-instruct",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "Multilingual",
    "Description": "Phi-4-Mini and Phi-4-Multimodal are compact language and multimodal models from Microsoft. Phi-4-Mini is a 3.8B-parameter LLM; the multimodal variant extends it to vision and speech/audio via a Mixture-of-LoRAs design, achieving competitive results while remaining lightweight."
  },
  {
    "Category": "Model and Methods",
    "Type": "Omni-Modal LLM",
    "Abbreviation": "Qwen2.5-Omni",
    "Title": "Qwen2.5-Omni Technical Report",
    "Time": "2025-03",
    "Affiliation": "Qwen Team, Alibaba Group",
    "Author": "Qwen Team (Jin Xu, Zhifang Guo, Jinzheng He, Hangrui Hu, Ting He, Shuai Bai, Keqin Chen, et al.)",
    "GitHub_Link": "https://github.com/QwenLM/Qwen2.5-Omni",
    "Paper_Link": "https://arxiv.org/abs/2503.20215",
    "HF_Link": "https://huggingface.co/Qwen/Qwen2.5-Omni-7B",
    "Demo_Link": "https://huggingface.co/spaces/Qwen/Qwen2.5-Omni-7B-Demo",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "Multilingual",
    "Description": "Qwen2.5-Omni is Alibaba's end-to-end omni-modal LLM handling text, image, audio, and video as inputs and producing both text and streaming speech outputs, built on a Thinker-Talker dual-track architecture that decouples reasoning and speech generation."
  },
  {
    "Category": "Dataset Resource",
    "Type": "Dataset Resource",
    "Abbreviation": "Audio-FLAN",
    "Title": "Audio-FLAN: A Preliminary Release",
    "Time": "2025-02",
    "Affiliation": "The Hong Kong University of Science and Technology",
    "Author": "Liumeng Xue, Ziya Zhou, Jiahao Pan, Zixuan Li, Shuai Fan, Yinghao Ma, Sitong Cheng, Dongchao Yang, Haohan Guo, Yujia Xiao, Xinsheng Wang, Zixuan Shen, Chuanbo Zhu, Xinshen Zhang, Tianchi Liu, Ruibin Yuan, Zeyue Tian, Haohe Liu, Emmanouil Benetos, Ge Zhang, Yike Guo, Wei Xue",
    "GitHub_Link": "https://github.com/lmxue/Audio-FLAN",
    "Paper_Link": "https://arxiv.org/abs/2502.16584",
    "HF_Link": "https://huggingface.co/datasets/HKUSTAudio/Audio-FLAN-Dataset",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "-",
    "Audio_Output": "-",
    "Language": "English",
    "Description": "Audio-FLAN is a large-scale instruction-tuning dataset with over 100 million instances across 80 tasks in speech, music, and sound, designed to unify audio understanding and generation for developing generalist audio-language models."
  },
  {
    "Category": "Speech Synthesis",
    "Type": "TTS Model",
    "Abbreviation": "IndexTTS",
    "Title": "IndexTTS: An Industrial-Level Controllable and Efficient Zero-Shot Text-To-Speech System",
    "Time": "2025-02",
    "Affiliation": "Bilibili",
    "Author": "Wei Deng, Siyi Zhou, Jingchen Shu, Jinchao Wang, Lu Wang",
    "GitHub_Link": "https://github.com/index-tts/index-tts",
    "Paper_Link": "https://arxiv.org/abs/2502.05512",
    "HF_Link": "https://huggingface.co/IndexTeam",
    "Demo_Link": "https://index-tts.github.io/",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "Multilingual",
    "Description": "IndexTTS is Bilibili's industrial-grade zero-shot TTS system optimised for production scenarios — controllable prosody, low-latency inference, and strong Chinese and English voice cloning from short reference audio."
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "OSUM",
    "Title": "OSUM: Advancing Open Speech Understanding Models with Limited Resources in Academia",
    "Time": "2025-02",
    "Affiliation": "ASLP@NPU",
    "Author": "Xuelong Geng, Kun Wei, Qijie Shao, Shuiyun Liu, Zhennan Lin, Zhixian Zhao, Guojian Li, Wenjie Tian, Peikun Chen, Yangze Li, Pengcheng Guo, Mingchen Shao, Shuiyuan Wang, Yuang Cao, Chengyou Wang, Tianyi Xu, Yuhang Dai, Xinfa Zhu, Yue Li, Li Zhang, Lei Xie",
    "GitHub_Link": "https://github.com/ASLP-lab/OSUM",
    "Paper_Link": "https://arxiv.org/pdf/2501.13306",
    "HF_Link": "https://huggingface.co/spaces/ASLP-lab/OSUM",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "Multilingual",
    "Description": "Large Language Models (LLMs) have made significant progress in various downstream tasks, inspiring the development of Speech Understanding Language Models (SULMs) to enable comprehensive speech-based interactions. However, most advanced SULMs are developed by the industry, leveraging large-scale datasets and computational resources that are not readily available to the academic community. Moreover, the lack of transparency in training details creates additional barriers to further innovation. In this study, we present OSUM, an Open Speech Understanding Model designed to explore the potential of training SLUMs under constrained academic resources. The OSUM model combines a Whisper encoder with a Qwen2 LLM and supports a wide range of speech tasks, including speech recognition (ASR), speech recognition with timestamps (SRWT), vocal event detection (VED), speech emotion recognition (SER), speaking style recognition (SSR), speaker gender classification (SGC), speaker age prediction (SAP), and speech-to-text chat (STTC). By employing an ASR+X training strategy, OSUM achieves efficient and stable multi-task training by simultaneously optimizing ASR alongside target tasks. Beyond delivering strong performance, OSUM emphasizes transparency by providing openly available data preparation and training methodologies, offering valuable insights and practical guidance for the academic community. By doing so, we aim to accelerate research and innovation in advanced SULM technologies."
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "OWLS",
    "Title": "OWLS: Scaling Laws for Multilingual Speech Recognition and Translation Models",
    "Time": "2025-02",
    "Affiliation": "Carnegie Mellon University, NVIDIA",
    "Author": "William Chen, Jinchuan Tian, Yifan Peng, Brian Yan, Chao-Han Huck Yang, Shinji Watanabe",
    "GitHub_Link": "",
    "Paper_Link": "https://arxiv.org/abs/2502.10373",
    "HF_Link": "https://huggingface.co/collections/espnet/owls-67ab7e5a9b8ee8a31bd9aab9",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "Multilingual",
    "Description": "OWLS systematically studies neural scaling laws for multilingual speech recognition and translation models, training a suite of models from 0.25B to 18B parameters on up to 360K hours of public speech data across 150+ languages to characterise how performance scales with data, compute, and parameter count."
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "Step-Audio",
    "Title": "Step-Audio: Unified Understanding and Generation in Intelligent Speech Interaction",
    "Time": "2025-02",
    "Affiliation": "Step-Audio Team, StepFun",
    "Author": "Ailin Huang, Boyong Wu, Bruce Wang, Chao Yan, Chen Hu, Chengli Feng, Fei Tian, Feiyu Shen, Jingbei Li, Mingrui Chen, Peng Liu, Ruihang Miao, Wang You, Xi Chen, Xuerui Yang, Yechang Huang, Yuxiang Zhang, Zheng Gong, Zixin Zhang, Hongyu Zhou, Jianjian Sun, Brian Li, Chengting Feng, Changyi Wan, Hanpeng Hu, Jianchang Wu, Jiangjie Zhen, Ranchen Ming, Song Yuan, Xuelin Zhang, Yu Zhou, Bingxin Li, Buyun Ma, Hongyuan Wang, Kang An, Wei Ji, Wen Li, Xuan Wen, Xiangwen Kong, Yuankai Ma, Yuanwei Liang, Yun Mou, Bahtiyar Ahmidi, Bin Wang, Bo Li, Changxin Miao, Chen Xu, Chenrun Wang, Dapeng Shi, Deshan Sun, Dingyuan Hu, Dula Sai, Enle Liu, Guanzhe Huang, Gulin Yan, Heng Wang, Haonan Jia, Haoyang Zhang, Jiahao Gong, Junjing Guo, Jiashuai Liu, Jiahong Liu, Jie Feng, Jie Wu, Jiaoren Wu, Jie Yang, Jinguo Wang, Jingyang Zhang, Junzhe Lin, Kaixiang Li, Lei Xia, Li Zhou, Liang Zhao, Longlong Gu, Mei Chen, Menglin Wu, Ming Li, Mingxiao Li, Mingliang Li, Mingyao Liang, Na Wang, Nie Hao, Qiling Wu, Qinyuan Tan, Ran Sun, Shuai Shuai, Shaoliang Pang, Shiliang Yang, Shuli Gao, Shanshan Yuan, Siqi Liu, Shihong Deng, Shilei Jiang, Sitong Liu, Tiancheng Cao, Tianyu Wang, Wenjin Deng, Wuxun Xie, Weipeng Ming, Wenqing He , Wen Sun, Xin Han, Xin Huang, Xiaomin Deng, Xiaojia Liu, Xin Wu, Xu Zhao, Yanan Wei, Yanbo Yu, Yang Cao, Yangguang Li, Yangzhen Ma, Yanming Xu, Yaoyu Wang, Yaqiang Shi, Yilei Wang, Yizhuang Zhou, Yinmin Zhong, Yang Zhang, Yaoben Wei, Yu Luo, Yuanwei Lu, Yuhe Yin, Yuchu Luo, Yuanhao Ding, Yuting Yan, Yaqi Dai, Yuxiang Yang, Zhe Xie, Zheng Ge, Zheng Sun, Zhewei Huang, Zhichao Chang, Zhisheng Guan, Zidong Yang, Zili Zhang, Binxing Jiao, Daxin Jiang, Heung-Yeung Shum, Jiansheng Chen, Jing Li, Shuchang Zhou, Xiangyu Zhang, Xinhao Zhang, Yibo Zhu",
    "GitHub_Link": "https://github.com/stepfun-ai/Step-Audio",
    "Paper_Link": "https://arxiv.org/abs/2502.11946",
    "HF_Link": "https://huggingface.co/stepfun-ai/Step-Audio-Chat",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "Multilingual",
    "Description": "Real-time speech interaction, serving as a fundamental interface for human-machine collaboration, holds immense potential. However, current open-source models face limitations such as high costs in voice data collection, weakness in dynamic control, and limited intelligence. To address these challenges, this paper introduces Step-Audio, the first production-ready open-source solution. Key contributions include: 1) a 130B-parameter unified speech-text multi-modal model that achieves unified understanding and generation, with the Step-Audio-Chat version open-sourced; 2) a generative speech data engine that establishes an affordable voice cloning framework and produces the open-sourced lightweight Step-Audio-TTS-3B model through distillation; 3) an instruction-driven fine control system enabling dynamic adjustments across dialects, emotions, singing, and RAP; 4) an enhanced cognitive architecture augmented with tool calling and role-playing abilities to manage complex tasks effectively. Based on our new StepEval-Audio-360 evaluation benchmark, Step-Audio achieves state-of-the-art performance in human evaluations, especially in terms of instruction following. On open-source benchmarks like LLaMA Question, shows 9.3% average performance improvement, demonstrating our commitment to advancing the development of open-source multi-modal language technologies."
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "Audio-CoT",
    "Title": "Audio-CoT: Exploring Chain-of-Thought Reasoning in Large Audio Language Model",
    "Time": "2025-01",
    "Affiliation": "Nanyang Technological University, Singapore",
    "Author": "Ziyang Ma, Zhuo Chen, Yuping Wang, Eng Siong Chng, Xie Chen",
    "GitHub_Link": "",
    "Paper_Link": "https://arxiv.org/abs/2501.07246",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "English",
    "Description": "Large Audio-Language Models (LALMs) have demonstrated remarkable performance in tasks involving audio perception and understanding, such as speech recognition and audio captioning. However, their reasoning capabilities - critical for solving complex real-world problems - remain underexplored. In this work, we conduct the first exploration into integrating Chain-of-Thought (CoT) reasoning into LALMs to enhance their reasoning ability across auditory modalities. We evaluate representative CoT methods, analyzing their performance in both information extraction and reasoning tasks across sound, music, and speech domains. Our findings reveal that CoT methods significantly improve performance on easy and medium tasks but encounter challenges with hard tasks, where reasoning chains can confuse the model rather than improve accuracy. Additionally, we identify a positive correlation between reasoning path length and accuracy, demonstrating the potential of scaling inference for advanced instruction-following and reasoning. This study not only highlights the promise of CoT in enhancing LALM reasoning capabilities but also identifies key limitations and provides actionable directions for future research."
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "LUCY",
    "Title": "LUCY: Linguistic Understanding and Control Yielding Early Stage of Her",
    "Time": "2025-01",
    "Affiliation": "Tencent",
    "Author": "Heting Gao, Hang Shao, Xiong Wang, Chaofan Qiu, Yunhang Shen, Siqi Cai, Yuchen Shi, Zihan Xu, Zuwei Long, Yike Zhang, Shaoqi Dong, Chaoyou Fu, Ke Li, Long Ma, Xing Sun",
    "GitHub_Link": "https://github.com/VITA-MLLM/LUCY",
    "Paper_Link": "https://arxiv.org/abs/2501.16327",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "English",
    "Description": "The film Her features Samantha, a sophisticated AI audio agent who is capable of understanding both linguistic and paralinguistic information in human speech and delivering real-time responses that are natural, informative and sensitive to emotional subtleties. Moving one step toward more sophisticated audio agent from recent advancement in end-to-end (E2E) speech systems, we propose LUCY, a E2E speech model that (1) senses and responds to user's emotion, (2) deliver responses in a succinct and natural style, and (3) use external tool to answer real-time inquiries. Experiment results show that LUCY is better at emotion control than peer models, generating emotional responses based on linguistic emotional instructions and responding to paralinguistic emotional cues. Lucy is also able to generate responses in a more natural style, as judged by external language models, without sacrificing much performance on general question answering. Finally, LUCY can leverage function calls to answer questions that are out of its knowledge scope."
  },
  {
    "Category": "Chatbot",
    "Type": "Multimodal Large Language Model",
    "Abbreviation": "MinMo",
    "Title": "MinMo: A Multimodal Large Language Model for Seamless Voice Interaction",
    "Time": "2025-01",
    "Affiliation": "FunAudioLLM Team, Tongyi Lab, Alibaba Group",
    "Author": "Qian Chen, Yafeng Chen, Yanni Chen, Mengzhe Chen, Yingda Chen, Chong Deng, Zhihao Du, Ruize Gao, Changfeng Gao, Zhifu Gao, Yabin Li, Xiang Lv, Jiaqing Liu, Haoneng Luo, Bin Ma, Chongjia Ni, Xian Shi, Jialong Tang, Hui Wang, Hao Wang, Wen Wang, Yuxuan Wang, Yunlan Xu, Fan Yu, Zhijie Yan, Yexin Yang, Baosong Yang, Xian Yang, Guanrou Yang, Tianyu Zhao, Qinglin Zhang, Shiliang Zhang, Nan Zhao, Pei Zhang, Chong Zhang, Jinren Zhou",
    "GitHub_Link": "",
    "Paper_Link": "https://arxiv.org/abs/2501.06282",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "https://funaudiollm.github.io/minmo",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "Multilingual",
    "Description": "MinMo is a multimodal large language model with approximately 8 billion parameters, designed for seamless voice interaction. It facilitates real-time, natural, and human-like voice conversations by integrating speech and text processing. Trained on 1.4 million hours of diverse speech data, MinMo supports full-duplex communication, enabling simultaneous two-way interactions between the user and the system. It also offers enhanced instruction-following capabilities, allowing control over speech generation with nuances such as emotions, dialects, speaking rates, and voice mimicry. The model achieves state-of-the-art performance across various benchmarks for voice comprehension and generation while maintaining the capabilities of text-based large language models."
  },
  {
    "Category": "Model and Methods",
    "Type": "Infrastructure",
    "Abbreviation": "Sayna",
    "Title": "Sayna: Voice Infrastructure for Audio LLM Applications",
    "Time": "2025-01",
    "Affiliation": "SaynaAI",
    "Author": "",
    "GitHub_Link": "https://github.com/SaynaAI/sayna",
    "Paper_Link": "",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "https://docs.sayna.ai/",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "Multilingual",
    "Description": "Sayna is a real-time voice infrastructure platform for building production voice-enabled LLM agents. It provides a unified API layer for STT/TTS with real-time streaming, multi-provider support, VAD, and voice analytics. Built with Rust and LiveKit, it offers low-latency WebSocket connections and REST endpoints for seamless voice-first experiences. Self-hostable with Docker and Kubernetes support."
  },
  {
    "Category": "Benchmark",
    "Type": "Benchmark",
    "Abbreviation": "UltraEval-Audio",
    "Title": "UltraEval-Audio",
    "Time": "2025-01",
    "Affiliation": "OpenBMB",
    "Author": "OpenBMB",
    "GitHub_Link": "https://github.com/OpenBMB/UltraEval-Audio",
    "Paper_Link": "",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "-",
    "Audio_Output": "-",
    "Language": "Multilingual",
    "Description": "UltraEval-Audio"
  },
  {
    "Category": "Benchmark",
    "Type": "Benchmark",
    "Abbreviation": "ADU-Bench",
    "Title": "Benchmarking Open-ended Audio Dialogue Understanding for Large Audio-Language Models",
    "Time": "2024-12",
    "Affiliation": "Tsinghua University, University of Oxford",
    "Author": "Kuofeng Gao, Shu-Tao Xia, Ke Xu, Philip Torr, Jindong Gu",
    "GitHub_Link": "",
    "Paper_Link": "https://arxiv.org/abs/2412.05167",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "",
    "Description": "ADU-Bench is a comprehensive evaluation benchmark designed to assess the open-ended audio dialogue understanding capabilities of Large Audio-Language Models (LALMs). It comprises over 20,000 open-ended audio dialogues across various scenarios, skills, languages, and ambiguity categories, providing a robust framework for evaluating and advancing LALMs in real-world audio dialogue applications."
  },
  {
    "Category": "Speech Synthesis",
    "Type": "TTS Model",
    "Abbreviation": "CosyVoice 2",
    "Title": "CosyVoice 2: Scalable Streaming Speech Synthesis with Large Language Models",
    "Time": "2024-12",
    "Affiliation": "FunAudioLLM Team, Tongyi Lab, Alibaba Group",
    "Author": "Zhihao Du, Yuxuan Wang, Qian Chen, Xian Shi, Xiang Lv, Tianyu Zhao, Zhifu Gao, Yexin Yang, Changfeng Gao, Hui Wang, Fan Yu, Huadai Liu, Zhengyan Sheng, Yue Gu, Chong Deng, Wen Wang, Shiliang Zhang, Zhijie Yan, Jingren Zhou",
    "GitHub_Link": "https://github.com/FunAudioLLM/CosyVoice",
    "Paper_Link": "https://arxiv.org/abs/2412.10117",
    "HF_Link": "https://huggingface.co/FunAudioLLM/CosyVoice2-0.5B",
    "Demo_Link": "https://funaudiollm.github.io/cosyvoice2/",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "Multilingual",
    "Description": "CosyVoice 2 is Alibaba's streaming TTS LLM, combining a unified speech tokenizer with a streaming-friendly LLM backbone to enable bidirectional streaming with sub-150 ms latency and improved cross-lingual zero-shot voice cloning."
  },
  {
    "Category": "Chatbot",
    "Type": "Spoken Dialogue Model",
    "Abbreviation": "GLM-4-Voice",
    "Title": "GLM-4-Voice: Towards Intelligent and Human-Like End-to-End Spoken Chatbot",
    "Time": "2024-12",
    "Affiliation": "Zhipu AI, Tsinghua University",
    "Author": "Aohan Zeng, Zhengxiao Du, Mingdao Liu, Kedong Wang, Shengmin Jiang, Lei Zhao, Yuxiao Dong, Jie Tang",
    "GitHub_Link": "https://github.com/THUDM/GLM-4-Voice",
    "Paper_Link": "https://arxiv.org/abs/2412.02612",
    "HF_Link": "https://huggingface.co/THUDM/glm-4-voice-9b",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "Bilingual (Chinese and English)",
    "Description": "GLM-4-Voice is an end-to-end spoken chatbot from Zhipu/Tsinghua that takes speech in and produces speech out directly, supporting low-latency streaming and natural Chinese / English conversation with controllable emotion, pitch, and speaking rate."
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "MERaLiON-AudioLLM",
    "Title": "MERaLiON-AudioLLM: Bridging Audio and Language with Large Language Models",
    "Time": "2024-12",
    "Affiliation": "I2R, A*STAR, Singapore",
    "Author": "Yingxu He, Zhuohan Liu, Shuo Sun, Bin Wang, Wenyu Zhang, Xunlong Zou, Nancy F. Chen, Ai Ti Aw",
    "GitHub_Link": "",
    "Paper_Link": "https://arxiv.org/abs/2412.09818",
    "HF_Link": "https://huggingface.co/MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION",
    "Demo_Link": "https://huggingface.co/spaces/MERaLiON/MERaLiON-AudioLLM",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "Multilingual",
    "Description": ""
  },
  {
    "Category": "Benchmark",
    "Type": "Interactive Benchmarking Tool",
    "Abbreviation": "TalkArena",
    "Title": "TalkArena: Interactive Evaluation of Large Audio Models",
    "Time": "2024-12",
    "Affiliation": "Stanford University, SCB 10X",
    "Author": "Ella Minzhi Li*, Will Held*, Michael J. Ryan, Kunat Pipatanakul, Potsawee Manakul, Hao Zhu, Diyi Yang (*Equal Contribution)",
    "GitHub_Link": "https://github.com/SALT-NLP/talk-arena",
    "Paper_Link": "",
    "HF_Link": "",
    "Demo_Link": "https://talkarena.org/",
    "Other_Link": "https://talkarena.org/blog",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "English",
    "Description": "TalkArena is an interactive platform designed to benchmark Large Audio Models (AudioLLMs) through real-world user interactions. Similar to Chatbot Arena for text-based models, TalkArena allows users to input audio prompts and receive text-based responses from various state-of-the-art models, facilitating pairwise comparisons and user preference evaluations. The platform supports models such as GPT-4o, Gemini, Qwen2-Audio, DiVA-Llama 3, and Typhoon-Audio, enabling comprehensive assessments of their performance in natural, conversational settings."
  },
  {
    "Category": "Model and Methods",
    "Type": "Multimodal Language Model",
    "Abbreviation": "Typhoon2-Audio",
    "Title": "Typhoon2-Audio: A Thai Multimodal Language Model for Speech and Text Processing",
    "Time": "2024-12",
    "Affiliation": "SCB 10X",
    "Author": "Kunat Pipatanakul, Potsawee Manakul, Natapong Nitarach, Warit Sirichotedumrong, Surapon Nonesung, Teetouch Jaknamon, Parinthapat Pengpun, Pittawat Taveekitworachai, Adisai Na-Thalang, Sittipong Sripaisarnmongkol, Krisanapong Jirayoot, Kasima Tharnpipitchai",
    "GitHub_Link": "https://github.com/scb-10x/typhoon2-audio/",
    "Paper_Link": "https://arxiv.org/abs/2412.13702",
    "HF_Link": "https://huggingface.co/scb10x/llama3.1-typhoon2-audio-8b-instruct",
    "Demo_Link": "https://audio.opentyphoon.ai/",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "Thai, English",
    "Description": "Typhoon2-Audio is a multimodal language model designed for Thai and English speech and text processing. It supports speech/audio input and both speech and text output, integrating components from SALMONN and Llama-Omni architectures. The model is trained on curated datasets to enhance instruction-following abilities and Thai language performance."
  },
  {
    "Category": "Benchmark",
    "Type": "Evaluation Framework",
    "Abbreviation": "Dynamic-SUPERB Phase-2",
    "Title": "Dynamic-SUPERB Phase-2: A Collaboratively Expanding Benchmark for Measuring the Capabilities of Spoken Language Models with 180 Tasks",
    "Time": "2024-11",
    "Affiliation": "National Taiwan University, University of Texas at Austin, Carnegie Mellon University, Nanyang Technological University, Toyota Technological Institute of Chicago, Université du Québec (INRS-EMT), NVIDIA, ASAPP, Renmin University of China",
    "Author": "Chien-yu Huang, Wei-Chih Chen, Shu-wen Yang, Andy T. Liu, Chen-An Li, Yu-Xiang Lin, Wei-Cheng Tseng, Anuj Diwan, Yi-Jen Shih, Jiatong Shi, William Chen, Xuanjun Chen, Chi-Yuan Hsiao, Puyuan Peng, Shih-Heng Wang, Chun-Yi Kuan, Haibin Wu, Siddhant Arora, Kai-Wei Chang, Yifan Peng, Roshan Sharma, Shinji Watanabe, Bhiksha Ramakrishnan, Shady Shehata, Hung-yi Lee",
    "GitHub_Link": "https://github.com/dynamic-superb/dynamic-superb",
    "Paper_Link": "https://arxiv.org/pdf/2411.05361",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "https://dynamic-superb.github.io/",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "Multilingual",
    "Description": "Dynamic-SUPERB Phase-2 is an open and evolving benchmark designed for the comprehensive evaluation of instruction-based universal speech models. Building upon its first generation, this second phase incorporates 125 new tasks contributed collaboratively by the global research community, expanding the benchmark to a total of 180 tasks. It broadens evaluation capabilities by introducing a wide array of novel and diverse tasks, including regression and sequence generation, across speech, music, and environmental audio domains. The benchmark aims to guide the development of universal spoken language models by providing a diverse and comprehensive evaluation platform."
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "Taiwanese AudioLLM",
    "Title": "Building a Taiwanese Mandarin Spoken Language Model: A First Attempt",
    "Time": "2024-11",
    "Affiliation": "National Taiwan University",
    "Author": "Chih-Kai Yang, Yu-Kuan Fu, Chen-An Li, Yi-Cheng Lin, Yu-Xiang Lin, Wei-Chih Chen, Ho Lam Chung, Chun-Yi Kuan, Wei-Ping Huang, Ke-Han Lu, Tzu-Quan Lin, Hsiu-Hsuan Wang, En-Pei Hu, Chan-Jan Hsu, Liang-Hsuan Tseng, I-Hsiang Chiu, Ulin Sanga, Xuanjun Chen, Po-chun Hsu, Shu-wen Yang, Hung-yi Lee",
    "GitHub_Link": "",
    "Paper_Link": "https://arxiv.org/pdf/2411.07111",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "Taiwanese Mandarin",
    "Description": "This technical report presents an initial attempt to develop a spoken large language model (LLM) for Taiwanese Mandarin, tailored for real-time, speech-to-speech interactions in multi-turn conversations. The end-to-end model employs a decoder-only transformer architecture, aiming for seamless interaction with full-duplex capabilities that allow simultaneous speaking and listening. The report details the training process, including data preparation with synthesized dialogues and adjustments for real-time interaction, and introduces a platform to evaluate conversational fluency and response coherence in multi-turn dialogues."
  },
  {
    "Category": "Survey",
    "Type": "Survey",
    "Abbreviation": "WavChat-Survey",
    "Title": "WavChat: A Survey of Spoken Dialogue Models",
    "Time": "2024-11",
    "Affiliation": "Zhejiang University",
    "Author": "Shengpeng Ji, Yifu Chen, Minghui Fang, Jialong Zuo, Jingyu Lu, Hanting Wang, Ziyue Jiang, Long Zhou, Shujie Liu, Xize Cheng, Xiaoda Yang, Zehan Wang, Qian Yang, Jian Li, Yidi Jiang, Jingzhen He, Yunfei Chu, Jin Xu, Zhou Zhao",
    "GitHub_Link": "",
    "Paper_Link": "https://arxiv.org/abs/2411.13577",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "No",
    "Audio_Output": "No",
    "Language": "",
    "Description": ""
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "DiVA",
    "Title": "Distilling an End-to-End Voice Assistant Without Instruction Training Data",
    "Time": "2024-10",
    "Affiliation": "Georgia Tech, Stanford",
    "Author": "William Held, Ella Li, Michael Ryan, Weiyan Shi, Yanzhe Zhang, Diyi Yang",
    "GitHub_Link": "https://github.com/diva-audio",
    "Paper_Link": "https://arxiv.org/pdf/2410.02678",
    "HF_Link": "",
    "Demo_Link": "https://diva-audio.github.io/",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "",
    "Description": "DiVA (Distilled Voice Assistant) is an end-to-end voice assistant model that integrates speech and text processing without relying on instruction training data. By utilizing self-supervision from a text-only large language model's responses to transcripts, DiVA generalizes to tasks such as spoken question answering, classification, and translation. Notably, it achieves a 72% user preference win rate compared to state-of-the-art models like Qwen 2 Audio, despite using significantly less training compute."
  },
  {
    "Category": "Speech Synthesis",
    "Type": "TTS Model",
    "Abbreviation": "F5-TTS",
    "Title": "F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching",
    "Time": "2024-10",
    "Affiliation": "Shanghai Jiao Tong University",
    "Author": "Yushen Chen, Zhikang Niu, Ziyang Ma, Keqi Deng, Chunhui Wang, Jian Zhao, Kai Yu, Xie Chen",
    "GitHub_Link": "https://github.com/SWivid/F5-TTS",
    "Paper_Link": "https://arxiv.org/abs/2410.06885",
    "HF_Link": "https://huggingface.co/SWivid/F5-TTS",
    "Demo_Link": "https://swivid.github.io/F5-TTS/",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "Multilingual",
    "Description": "F5-TTS is a fully non-autoregressive TTS system based on flow matching with Diffusion Transformer, producing high-fidelity zero-shot voice cloning faster than autoregressive codec-LM TTS systems."
  },
  {
    "Category": "Benchmark",
    "Type": "Benchmark",
    "Abbreviation": "MMAU",
    "Title": "MMAU: A Massive Multi-Task Audio Understanding and Reasoning Benchmark",
    "Time": "2024-10",
    "Affiliation": "University of Maryland",
    "Author": "S Sakshi, Utkarsh Tyagi, Sonal Kumar, Ashish Seth, Ramaneswaran Selvakumar, Oriol Nieto, Ramani Duraiswami, Sreyan Ghosh, Dinesh Manocha",
    "GitHub_Link": "https://github.com/Sakshi113/mmau/tree/main",
    "Paper_Link": "https://arxiv.org/pdf/2410.19168",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "https://sakshi113.github.io/mmau_homepage/",
    "Audio_Input": "",
    "Audio_Output": "",
    "Language": "English",
    "Description": ""
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "SPIRIT LM",
    "Title": "SPIRIT LM: Interleaved Spoken and Written Language Model",
    "Time": "2024-10",
    "Affiliation": "Meta",
    "Author": "Tu Anh Nguyen, Benjamin Muller, Bokai Yu, Marta R. Costa-jussa, Maha Elbayad, Sravya Popuri, Paul-Ambroise Duquenne, Robin Algayres, Ruslan Mavlyutov, Itai Gat, Gabriel Synnaeve, Juan Pino, Benoît Sagot, Emmanuel Dupoux",
    "GitHub_Link": "https://github.com/facebookresearch/spiritlm",
    "Paper_Link": "https://arxiv.org/pdf/2402.05755",
    "HF_Link": "",
    "Demo_Link": "https://speechbot.github.io/spiritlm/",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "",
    "Description": "SPIRIT LM is a foundational multimodal language model developed by Meta that seamlessly integrates text and speech modalities. By extending a pretrained text language model to the speech domain through continuous training on both text and speech units, SPIRIT LM can process interleaved speech and text sequences. It comes in two versions: BASE, utilizing speech phonetic units (HuBERT), and EXPRESSIVE, which incorporates pitch and style units to model expressivity. The model demonstrates capabilities in tasks such as ASR, TTS, and speech classification, leveraging few-shot learning across modalities."
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "SpeechEmotionLlama",
    "Title": "Frozen Large Language Models Can Perceive Paralinguistic Aspects of Speech",
    "Time": "2024-10",
    "Affiliation": "MIT, Meta",
    "Author": "Wonjune Kang, Junteng Jia, Chunyang Wu, Wei Zhou, Egor Lakomkin, Yashesh Gaur, Leda Sari, Suyoun Kim, Ke Li, Jay Mahadeokar, Ozlem Kalinli",
    "GitHub_Link": "",
    "Paper_Link": "https://arxiv.org/pdf/2410.01162",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "",
    "Description": "This paper explores the capability of large language models (LLMs) to understand paralinguistic aspects of speech, such as emotions and speaking styles, without fine-tuning their weights. By training a speech encoder to produce token embeddings that align the LLM's responses to expressive speech prompts with semantically matching text prompts specifying the speaker's emotion, the system effectively conveys both semantic and paralinguistic information to the LLM. Experiments demonstrate that this approach enables LLMs to generate higher quality and more empathetic responses to expressive speech inputs."
  },
  {
    "Category": "Survey",
    "Type": "Survey",
    "Abbreviation": "SpeechLLM-Survey",
    "Title": "A Survey on Speech Large Language Models",
    "Time": "2024-10",
    "Affiliation": "SJTU, AISpeech",
    "Author": "Jing Peng, Yucheng Wang, Yu Xi, Xu Li, Xizhuo Zhang, Kai Yu",
    "GitHub_Link": "",
    "Paper_Link": "https://arxiv.org/pdf/2410.18908v2",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "No",
    "Audio_Output": "No",
    "Language": "",
    "Description": ""
  },
  {
    "Category": "Survey",
    "Type": "Survey",
    "Abbreviation": "SpeechLM-Survey",
    "Title": "Recent Advances in Speech Language Models: A Survey",
    "Time": "2024-10",
    "Affiliation": "CUHK, Tencent",
    "Author": "Wenqian Cui, Dianzhi Yu, Xiaoqi Jiao, Ziqiao Meng, Guangyan Zhang, Qichao Wang, Yiwen Guo, Irwin King",
    "GitHub_Link": "",
    "Paper_Link": "https://arxiv.org/pdf/2410.03751",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "No",
    "Audio_Output": "No",
    "Language": "",
    "Description": ""
  },
  {
    "Category": "Benchmark",
    "Type": "Benchmark",
    "Abbreviation": "VoiceBench",
    "Title": "VoiceBench: Benchmarking LLM-Based Voice Assistants",
    "Time": "2024-10",
    "Affiliation": "National University of Singapore",
    "Author": "Yiming Chen, Xianghu Yue, Chen Zhang, Xiaoxue Gao, Robby T. Tan, Haizhou Li",
    "GitHub_Link": "https://github.com/MatthewCYM/VoiceBench",
    "Paper_Link": "https://arxiv.org/pdf/2410.17196",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "",
    "Description": "VoiceBench is a comprehensive evaluation framework designed to assess the capabilities of LLM-based voice assistants. It evaluates various aspects, including general knowledge, instruction-following abilities, and safety measures, using both synthetic and real spoken instruction data that reflect real-world variations such as speaker characteristics, environmental factors, and content complexities."
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "ASRCompare",
    "Title": "Comparing Discrete and Continuous Space LLMs for Speech Recognition",
    "Time": "2024-09",
    "Affiliation": "Tsinghua University, Tencent AI Lab",
    "Author": "Yaoxun Xu, Shi-Xiong Zhang, Jianwei Yu, Zhiyong Wu, Dong Yu",
    "GitHub_Link": "https://github.com/xuyaoxun/ASRCompare",
    "Paper_Link": "https://arxiv.org/pdf/2409.00800v1",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "",
    "Description": "This paper investigates discrete and continuous speech representations in Large Language Model (LLM)-based Automatic Speech Recognition (ASR). It organizes these representations by feature continuity and training approach into four categories: supervised and unsupervised for both discrete and continuous types. The study further classifies LLMs based on their input and autoregressive feedback into continuous and discrete-space models. Using specialized encoders and comparative analysis with a Joint-Training-From-Scratch Language Model (JTFS LM) and pre-trained LLaMA2-7b, it provides a detailed examination of their effectiveness. Notably, the work presents an open-sourced achievement of a state-of-the-art Word Error Rate (WER) of 1.69% on LibriSpeech using a HuBERT encoder, offering valuable insights for advancing ASR and natural language processing research."
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "AudioBERT",
    "Title": "AudioBERT: Audio Knowledge Augmented Language Model",
    "Time": "2024-09",
    "Affiliation": "POSTECH, Inha University",
    "Author": "Hyunjong Ok, Suho Yoo, Jaeho Lee",
    "GitHub_Link": "https://github.com/HJ-Ok/AudioBERT",
    "Paper_Link": "https://arxiv.org/pdf/2409.08199",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "",
    "Description": "AudioBERT is a language model augmented with auditory knowledge to enhance its performance on tasks requiring an understanding of sounds. It employs a retrieval-based approach, utilizing an Auditory Knowledge Span Detector to identify text spans necessitating auditory knowledge. Relevant audio embeddings are retrieved using CLAP (Contrastive Language-Audio Pretraining) and integrated into the language model. This method enables AudioBERT to effectively handle tasks such as animal sound recognition and sound pitch comparison, as demonstrated on the AuditoryBench dataset."
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "DeSTA2",
    "Title": "Developing Instruction-Following Speech Language Model Without Speech Instruction-Tuning Data",
    "Time": "2024-09",
    "Affiliation": "National Taiwan University, NVIDIA",
    "Author": "Ke-Han Lu, Zhehuai Chen, Szu-Wei Fu, Chao-Han Huck Yang, Jagadeesh Balam, Boris Ginsburg, Yu-Chiang Frank Wang, Hung-yi Lee",
    "GitHub_Link": "https://github.com/kehanlu/DeSTA2",
    "Paper_Link": "https://arxiv.org/pdf/2409.20007",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "",
    "Description": "DeSTA2 is a speech-language model that integrates pre-trained speech models with large language models to interpret and generate comprehensive natural language descriptions. It enhances the model's speech comprehension capabilities without extensive speech instruction-tuning, thereby preserving the inherent language understanding of the text-based LLM. DeSTA2 demonstrates impressive performance on benchmarks like Dynamic-SUPERB and AIR-Bench-Chat, showcasing its ability to follow complex instructions derived from LLMs, such as specific output formatting and chain-of-thought reasoning."
  },
  {
    "Category": "Multimodal",
    "Type": "Model",
    "Abbreviation": "EMOVA",
    "Title": "EMOVA: Empowering Language Models to See, Hear and Speak with Vivid Emotions",
    "Time": "2024-09",
    "Affiliation": "HKUST",
    "Author": "Kai Chen, Yunhao Gou, Runhui Huang, Zhili Liu, Daxin Tan, Jing Xu, Chunwei Wang, Yi Zhu, Yihan Zeng, Kuo Yang, Dingdong Wang, Kun Xiang, Haoyuan Li, Haoli Bai, Jianhua Han, Xiaohui Li, Weike Jin, Nian Xie, Yu Zhang, James T. Kwok, Hengshuang Zhao, Xiaodan Liang, Dit-Yan Yeung, Xiao Chen, Zhenguo Li, Wei Zhang, Qun Liu, Jun Yao, Lanqing Hong, Lu Hou, Hang Xu",
    "GitHub_Link": "",
    "Paper_Link": "https://arxiv.org/pdf/2409.18042",
    "HF_Link": "",
    "Demo_Link": "https://emova-ollm.github.io/",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "English",
    "Description": ""
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "LLaMA-Omni",
    "Title": "LLaMA-Omni: Seamless Speech Interaction with Large Language Models",
    "Time": "2024-09",
    "Affiliation": "Institute of Computing Technology, Chinese Academy of Sciences (ICT/CAS)",
    "Author": "Qingkai Fang, Shoutao Guo, Yan Zhou, Zhengrui Ma, Shaolei Zhang, Yang Feng",
    "GitHub_Link": "https://github.com/ictnlp/llama-omni",
    "Paper_Link": "https://arxiv.org/pdf/2409.06666v1",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "",
    "Description": "LLaMA-Omni is a low-latency, high-quality end-to-end speech interaction model built upon Llama-3.1-8B-Instruct. It enables seamless speech interactions with large language models, simultaneously generating both text and speech responses based on speech instructions. The model integrates a pretrained speech encoder, a speech adaptor, an LLM, and a streaming speech decoder, eliminating the need for intermediate speech transcription. Experimental results demonstrate that LLaMA-Omni provides superior responses in both content and style, with response latency as low as 226ms. Training LLaMA-Omni requires less than 3 days on 4 GPUs, facilitating efficient development of speech-language models."
  },
  {
    "Category": "Speech Synthesis",
    "Type": "TTS Model",
    "Abbreviation": "MaskGCT",
    "Title": "MaskGCT: Zero-Shot Text-to-Speech with Masked Generative Codec Transformer",
    "Time": "2024-09",
    "Affiliation": "The Chinese University of Hong Kong (Shenzhen), Amphion",
    "Author": "Yuancheng Wang, Haoyue Zhan, Liwei Liu, Ruihong Zeng, Haotian Guo, Jiachen Zheng, Qiang Zhang, Xueyao Zhang, Shunsi Zhang, Zhizheng Wu",
    "GitHub_Link": "https://github.com/open-mmlab/Amphion/tree/main/models/tts/maskgct",
    "Paper_Link": "https://arxiv.org/abs/2409.00750",
    "HF_Link": "https://huggingface.co/amphion/MaskGCT",
    "Demo_Link": "https://maskgct.github.io/",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "Multilingual",
    "Description": "MaskGCT is a fully non-autoregressive TTS system that predicts masked tokens with transformers in two stages (text→semantic tokens→acoustic codec tokens), without explicit text-speech alignment or phone-level duration prediction. Released as part of the Amphion toolkit; strong zero-shot voice cloning."
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "MoWE-Audio",
    "Title": "MoWE-Audio: Multitask AudioLLMs with Mixture of Weak Encoders",
    "Time": "2024-09",
    "Affiliation": "A*STAR",
    "Author": "Wenyu Zhang, Shuo Sun, Bin Wang, Xunlong Zou, Zhuohan Liu, Yingxu He, Geyu Lin, Nancy F. Chen, Ai Ti Aw",
    "GitHub_Link": "",
    "Paper_Link": "https://arxiv.org/pdf/2409.06635",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "",
    "Description": "MoWE-Audio introduces a novel approach to enhance Audio Large Language Models (AudioLLMs) by incorporating a mixture of 'weak' encoders. This method supplements a base encoder with a pool of lightweight encoders, selectively activated based on the audio input, to improve feature extraction without significantly increasing model size. Empirical results demonstrate that MoWE effectively enhances multi-task performance, broadening the applicability of AudioLLMs to more diverse audio tasks."
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "Moshi",
    "Title": "Moshi: a speech-text foundation model for real-time dialogue",
    "Time": "2024-09",
    "Affiliation": "Kyutai",
    "Author": "Alexandre Défossez, Laurent Mazaré, Manu Orsini, Amélie Royer, Patrick Pérez, Hervé Jégou, Edouard Grave, Neil Zeghidour",
    "GitHub_Link": "https://github.com/kyutai-labs/moshi",
    "Paper_Link": "https://arxiv.org/pdf/2410.00037",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "",
    "Description": "Moshi is a speech-text foundation model and full-duplex spoken dialogue framework that addresses limitations in current spoken dialogue systems by integrating speech recognition and generation into a single model. It enables real-time, natural conversations by reducing latency and preserving non-linguistic information such as emotion and accent. Moshi models multiple audio streams in parallel, allowing for seamless handling of overlapping speech and interruptions, thereby enhancing the naturalness of human-computer interactions."
  },
  {
    "Category": "Benchmark",
    "Type": "Benchmark",
    "Abbreviation": "SALMon",
    "Title": "A Suite for Acoustic Language Model Evaluation",
    "Time": "2024-09",
    "Affiliation": "Hebrew University of Jerusalem",
    "Author": "Gallil Maimon, Amit Roth, Yossi Adi",
    "GitHub_Link": "https://github.com/adiyoss-lab/salmon",
    "Paper_Link": "https://arxiv.org/abs/2409.07437",
    "HF_Link": "",
    "Demo_Link": "https://pages.cs.huji.ac.il/adiyoss-lab/salmon/",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "",
    "Description": "SALMon is a novel evaluation suite encompassing background noise, emotion, speaker identity, and room impulse response. It evaluates both the consistency of the inspected element and its alignment with the spoken text, providing a comprehensive benchmark for speech language models."
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "Ultravox",
    "Title": "Ultravox: A Fast Multimodal LLM for Real-Time Voice",
    "Time": "2024-09",
    "Affiliation": "Fixie.ai",
    "Author": "",
    "GitHub_Link": "https://github.com/fixie-ai/ultravox",
    "Paper_Link": "",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "Multilingual",
    "Description": "Ultravox is an open-source multimodal large language model (LLM) designed for real-time voice interactions. It extends any open-weight LLM with a multimodal projector that converts audio directly into the high-dimensional space used by LLMs, eliminating the need for a separate Automatic Speech Recognition (ASR) stage. This direct coupling allows Ultravox to respond more quickly than systems that combine separate ASR and LLM components. The current version (v0.4) supports multiple languages, including Arabic, Chinese, Dutch, English, French, German, Hindi, Italian, Japanese, Portuguese, Russian, Spanish, Swedish, Turkish, and Ukrainian. Ultravox is capable of understanding both text and human speech, making it suitable for applications such as voice agents, speech-to-speech translation, and analysis of spoken audio."
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "Mini-Omni",
    "Title": "Mini-Omni: Language Models Can Hear, Talk While Thinking in Streaming",
    "Time": "2024-08",
    "Affiliation": "Tsinghua University",
    "Author": "Zhifei Xie, Changqiao Wu",
    "GitHub_Link": "https://github.com/gpt-omni/mini-omni",
    "Paper_Link": "https://arxiv.org/pdf/2408.16725",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "",
    "Description": "Mini-Omni is an open-source multimodal large language model designed for real-time speech interaction. It features end-to-end speech input and streaming audio output capabilities, enabling seamless voice conversations without the need for separate ASR or TTS systems. The model employs a text-instructed speech generation method and batch-parallel strategies during inference to enhance performance. Additionally, the VoiceAssistant-400K dataset is introduced to fine-tune models optimized for speech output. Mini-Omni aims to facilitate real-time human-computer interaction by integrating speech processing directly into the language model framework."
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "MooER",
    "Title": "MooER: LLM-based Speech Recognition and Translation Models from Moore Threads",
    "Time": "2024-08",
    "Affiliation": "Moore Threads",
    "Author": "Zhenlin Liang, Junhao Xu, Yi Liu, Yichao Hu, Jian Li, Yajun Zheng, Meng Cai, Hua Wang",
    "GitHub_Link": "https://github.com/MooreThreads/MooER",
    "Paper_Link": "https://arxiv.org/pdf/2408.05101",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "Multilingual",
    "Description": "MooER is a Large Language Model (LLM)-based system developed by Moore Threads for automatic speech recognition (ASR) and automatic speech translation (AST). Trained on a 5,000-hour pseudo-labeled dataset comprising open-source and self-collected speech data, MooER achieves performance comparable to other open-source models trained on significantly larger datasets. Notably, it attains a BLEU score of 25.2 on the CoVoST 2 zh→en test set, indicating superior translation capabilities. The model architecture integrates an encoder, adapter, and decoder (LLM), optimized with techniques such as DeepSpeed, data loader acceleration, gradient checkpointing, gradient accumulation, and BF16 training. MooER supports multiple languages and is designed to facilitate end-to-end speech interaction, translation, and recognition tasks."
  },
  {
    "Category": "Benchmark",
    "Type": "Benchmark",
    "Abbreviation": "MuChoMusic",
    "Title": "MuChoMusic: Evaluating Music Understanding in Multimodal Audio-Language Models",
    "Time": "2024-08",
    "Affiliation": "UPF, QMUL, UMG",
    "Author": "Benno Weck, Ilaria Manco, Emmanouil Benetos, Elio Quinton, George Fazekas, Dmitry Bogdanov",
    "GitHub_Link": "https://github.com/mulab-mir/muchomusic",
    "Paper_Link": "https://arxiv.org/abs/2408.01337",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "",
    "Description": ""
  },
  {
    "Category": "Model and Methods",
    "Type": "Multimodal Language Model",
    "Abbreviation": "Typhoon-Audio",
    "Title": "Typhoon-Audio: Enhancing Low-Resource Language and Instruction Following Capabilities of Audio Language Models",
    "Time": "2024-08",
    "Affiliation": "SCB 10X",
    "Author": "Potsawee Manakul, Guangzhi Sun, Warit Sirichotedumrong, Kasima Tharnpipitchai, Kunat Pipatanakul",
    "GitHub_Link": "",
    "Paper_Link": "https://arxiv.org/abs/2409.10999",
    "HF_Link": "https://huggingface.co/scb10x/llama-3-typhoon-v1.5-8b-audio-preview",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "Thai, English",
    "Description": "Typhoon-Audio is a multimodal language model supporting speech/audio input and text output. Based on the SALMONN architecture, it is trained on curated datasets to enhance general instruction-following abilities and performance in the Thai language, addressing challenges in low-resource language processing."
  },
  {
    "Category": "Multimodal",
    "Type": "Omni-Modal LLM",
    "Abbreviation": "VITA",
    "Title": "VITA: Towards Open-Source Interactive Omni Multimodal LLM",
    "Time": "2024-08",
    "Affiliation": "Tencent Youtu Lab, Nanjing University, Xiamen University",
    "Author": "Chaoyou Fu, Haojia Lin, Zuwei Long, Yunhang Shen, Yuhang Dai, Meng Zhao, Yi-Fan Zhang, Ke Li, Xiawu Zheng, Rongrong Ji",
    "GitHub_Link": "https://github.com/VITA-MLLM/VITA",
    "Paper_Link": "https://arxiv.org/abs/2408.05211",
    "HF_Link": "https://huggingface.co/VITA-MLLM/VITA",
    "Demo_Link": "https://vita-home.github.io/",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "Multilingual",
    "Description": "VITA is one of the first fully open-source omni-modal LLMs supporting interactive video, image, audio, and text inputs with non-awakening interaction and audio interrupt — establishing a recipe later adopted by many Chinese omni models."
  },
  {
    "Category": "Benchmark",
    "Type": "Benchmark",
    "Abbreviation": "AudioEntailment",
    "Title": "Audio Entailment: Assessing Deductive Reasoning for Audio Understanding",
    "Time": "2024-07",
    "Affiliation": "CMU, Microsoft",
    "Author": "Soham Deshmukh, Shuo Han, Hazim Bukhari, Benjamin Elizalde, Hannes Gamper, Rita Singh, Bhiksha Raj",
    "GitHub_Link": "https://github.com/microsoft/AudioEntailment",
    "Paper_Link": "https://arxiv.org/pdf/2407.18062",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "",
    "Description": ""
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "CompA",
    "Title": "CompA: Addressing the Gap in Compositional Reasoning in Audio-Language Models",
    "Time": "2024-07",
    "Affiliation": "University of Maryland, College Park; Adobe, USA; NVIDIA, Bangalore, India",
    "Author": "Sreyan Ghosh, Ashish Seth, Sonal Kumar, Utkarsh Tyagi, Chandra Kiran Evuru, S. Ramaneswaran, S. Sakshi, Oriol Nieto, Ramani Duraiswami, Dinesh Manocha",
    "GitHub_Link": "https://github.com/Sreyan88/CompA",
    "Paper_Link": "https://arxiv.org/abs/2310.08753",
    "HF_Link": "",
    "Demo_Link": "https://sreyan88.github.io/compa_iclr/",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "",
    "Description": "CompA introduces two expert-annotated benchmarks, CompA-order and CompA-attribute, designed to evaluate compositional reasoning in audio-language models (ALMs). CompA-order assesses an ALM's understanding of the sequence of acoustic events, while CompA-attribute evaluates attribute-binding of these events. The study reveals that current ALMs perform marginally better than random chance in compositional reasoning tasks. To address this, the authors propose CompA-CLAP, a fine-tuned model employing a novel learning method with composition-aware hard negatives and a modular contrastive loss, enhancing fine-grained compositional understanding without relying on extensive compositional audio datasets. CompA-CLAP demonstrates significant improvements over baseline models on the CompA benchmark, indicating its superior compositional reasoning capabilities."
  },
  {
    "Category": "Model and Methods",
    "Type": "Research",
    "Abbreviation": "Decoder-only LLMs for STT",
    "Title": "Investigating Decoder-only Large Language Models for Speech-to-text Translation",
    "Time": "2024-07",
    "Affiliation": "NTU-Taiwan, Meta",
    "Author": "Chao-Wei Huang, Hui Lu, Hongyu Gong, Hirofumi Inaguma, Ilia Kulikov, Ruslan Mavlyutov, Sravya Popuri",
    "GitHub_Link": "",
    "Paper_Link": "https://arxiv.org/pdf/2407.03169",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "Multilingual",
    "Description": "This research paper explores the application of decoder-only large language models for speech-to-text translation, analyzing their effectiveness and potential advantages in multilingual translation tasks."
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "FunAudioLLM",
    "Title": "FunAudioLLM: Voice Understanding and Generation Foundation Models for Natural Interaction Between Humans and LLMs",
    "Time": "2024-07",
    "Affiliation": "Alibaba",
    "Author": "FunAudioLLM Team",
    "GitHub_Link": "https://github.com/FunAudioLLM",
    "Paper_Link": "https://arxiv.org/pdf/2407.04051v3",
    "HF_Link": "",
    "Demo_Link": "https://fun-audio-llm.github.io/",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "Multilingual",
    "Description": "FunAudioLLM is a family of foundation models developed by Alibaba for voice understanding and generation, facilitating natural interaction between humans and large language models. It comprises SenseVoice for speech understanding and CosyVoice for speech generation, and supports multilingual audio input and output, enabling seamless voice-based communication and interaction."
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "GAMA",
    "Title": "GAMA: A Large Audio-Language Model with Advanced Audio Understanding and Complex Reasoning Abilities",
    "Time": "2024-07",
    "Affiliation": "University of Maryland, College Park",
    "Author": "Sreyan Ghosh, Sonal Kumar, Ashish Seth, Chandra Kiran Reddy Evuru, Utkarsh Tyagi, S Sakshi, Oriol Nieto, Ramani Duraiswami, Dinesh Manocha",
    "GitHub_Link": "https://github.com/Sreyan88/GAMA",
    "Paper_Link": "https://arxiv.org/abs/2406.11768",
    "HF_Link": "",
    "Demo_Link": "https://sreyan88.github.io/gamaaudio/",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "",
    "Description": "GAMA is a General-purpose Large Audio-Language Model (LALM) designed to enhance audio understanding and complex reasoning abilities. It integrates a Large Language Model (LLM) with multiple types of audio representations, including features from a custom Audio Q-Former and a multi-layer aggregator that processes features from various layers of an audio encoder. Fine-tuned on a large-scale audio-language dataset, GAMA is equipped with advanced audio understanding capabilities. Additionally, it employs CompA-R, a synthetically generated instruction-tuning dataset, to endow the model with complex reasoning abilities, particularly for open-ended audio question-answering tasks. GAMA outperforms existing LALMs across diverse audio understanding tasks, demonstrating superior performance in both automated and expert human evaluations."
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "LLaST",
    "Title": "LLaST: Improved End-to-end Speech Translation System Leveraged by Large Language Models",
    "Time": "2024-07",
    "Affiliation": "The Chinese University of Hong Kong, Shenzhen; Shanghai AI Laboratory; Nara Institute of Science and Technology, Japan",
    "Author": "Xi Chen, Songyang Zhang, Qibing Bai, Kai Chen, Satoshi Nakamura",
    "GitHub_Link": "https://github.com/openaudiolab/LLaST",
    "Paper_Link": "https://arxiv.org/pdf/2407.15415",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "Multilingual",
    "Description": "LLaST is a framework designed to enhance end-to-end speech-to-text translation systems by leveraging Large Language Models (LLMs). It addresses limitations in traditional E2E ST models through innovative architecture design and optimization techniques, including ASR-augmented training, multilingual data augmentation, and dual-LoRA optimization. Evaluations on the CoVoST-2 benchmark demonstrate LLaST's superior performance and scalability, making it a strong baseline for future speech translation research."
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "Qwen2-Audio",
    "Title": "Qwen2-Audio Technical Report",
    "Time": "2024-07",
    "Affiliation": "Alibaba Group",
    "Author": "Yunfei Chu, Jin Xu, Qian Yang, Haojie Wei, Xipin Wei, Zhifang Guo, Yichong Leng, Yuanjun Lv, Jinzheng He, Junyang Lin, Chang Zhou, Jingren Zhou",
    "GitHub_Link": "https://github.com/QwenLM/Qwen2-Audio",
    "Paper_Link": "https://arxiv.org/pdf/2407.10759",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "Multilingual",
    "Description": "Qwen2-Audio is a large-scale audio-language model developed by Alibaba Group, capable of accepting various audio signal inputs and performing audio analysis or generating textual responses based on speech instructions. It introduces two distinct audio interaction modes: voice chat, allowing users to engage in voice interactions without text input, and audio analysis, enabling users to provide audio and text instructions for analysis during interaction. The model has been enhanced with instruction-following capabilities and optimized using Direct Preference Optimization (DPO) to improve performance in terms of factuality and adherence to desired behavior. Evaluations indicate that Qwen2-Audio outperforms previous state-of-the-art models in audio-centric instruction-following tasks."
  },
  {
    "Category": "Speech Recognition",
    "Type": "Speech Understanding Model",
    "Abbreviation": "SenseVoice",
    "Title": "FunAudioLLM: Voice Understanding and Generation Foundation Models for Natural Interaction Between Humans and LLMs",
    "Time": "2024-07",
    "Affiliation": "Tongyi SpeechTeam, Alibaba Group",
    "Author": "FunAudioLLM Team",
    "GitHub_Link": "https://github.com/FunAudioLLM/SenseVoice",
    "Paper_Link": "https://arxiv.org/abs/2407.04051",
    "HF_Link": "https://huggingface.co/FunAudioLLM/SenseVoiceSmall",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "Multilingual",
    "Description": "SenseVoice is a multilingual speech understanding foundation model (part of the FunAudioLLM report) that jointly performs speech recognition, spoken emotion recognition, and audio event detection. The non-autoregressive SenseVoice-Small (234M params) delivers low-latency recognition for five languages (Mandarin, Cantonese, English, Japanese, Korean) and runs more than 5x faster than Whisper-Small and more than 15x faster than Whisper-Large, while SenseVoice-Large supports 50+ languages."
  },
  {
    "Category": "Audio Generation",
    "Type": "Audio Generation Model",
    "Abbreviation": "Stable Audio Open",
    "Title": "Stable Audio Open",
    "Time": "2024-07",
    "Affiliation": "Stability AI",
    "Author": "Zach Evans, Julian D. Parker, CJ Carr, Zack Zukowski, Josiah Taylor, Jordi Pons",
    "GitHub_Link": "",
    "Paper_Link": "https://arxiv.org/abs/2407.14358",
    "HF_Link": "https://huggingface.co/stabilityai/stable-audio-open-1.0",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "No",
    "Audio_Output": "Yes",
    "Language": "English",
    "Description": "Stable Audio Open is Stability AI's open-weight text-to-audio diffusion model for generating short stereo audio clips (up to ~47s) including sound effects and music samples, trained only on Creative Commons audio."
  },
  {
    "Category": "Study",
    "Type": "Research",
    "Abbreviation": "Audio Hallucination",
    "Title": "Understanding Sounds, Missing the Questions: The Challenge of Object Hallucination in Large Audio-Language Models",
    "Time": "2024-06",
    "Affiliation": "NTU-Taiwan",
    "Author": "Chun-Yi Kuan, Wei-Ping Huang, Hung-yi Lee",
    "GitHub_Link": "https://github.com/kuan2jiu99/audio-hallucination",
    "Paper_Link": "https://arxiv.org/pdf/2406.08402",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "",
    "Description": ""
  },
  {
    "Category": "Benchmark",
    "Type": "Benchmark",
    "Abbreviation": "AudioBench",
    "Title": "AudioBench: A Universal Benchmark for Audio Large Language Models",
    "Time": "2024-06",
    "Affiliation": "A*STAR, Singapore",
    "Author": "Bin Wang, Xunlong Zou, Geyu Lin, Shuo Sun, Zhuohan Liu, Wenyu Zhang, Zhengyuan Liu, AiTi Aw, Nancy F. Chen",
    "GitHub_Link": "https://github.com/AudioLLMs/AudioBench",
    "Paper_Link": "https://arxiv.org/abs/2406.16020",
    "HF_Link": "",
    "Demo_Link": "https://huggingface.co/spaces/AudioLLMs/AudioBench-Leaderboard",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "",
    "Description": "AudioBench is a universal benchmark designed to evaluate Audio Large Language Models (AudioLLMs). It encompasses 8 distinct tasks and 26 datasets, including 7 newly proposed datasets, targeting speech understanding, audio scene understanding, and voice understanding (paralinguistic)."
  },
  {
    "Category": "Safety",
    "Type": "Safety",
    "Abbreviation": "CodecFake",
    "Title": "CodecFake: Enhancing Anti-Spoofing Models Against Deepfake Audios from Codec-Based Speech Synthesis Systems",
    "Time": "2024-06",
    "Affiliation": "National Taiwan University",
    "Author": "Haibin Wu, Yuan Tseng, Hung-yi Lee",
    "GitHub_Link": "https://github.com/roger-tseng/CodecFake",
    "Paper_Link": "https://arxiv.org/abs/2406.07237",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "https://codecfake.github.io/",
    "Audio_Input": "",
    "Audio_Output": "",
    "Language": "English",
    "Description": ""
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "DeSTA",
    "Title": "DeSTA: Enhancing Speech Language Models through Descriptive Speech-Text Alignment",
    "Time": "2024-06",
    "Affiliation": "NTU-Taiwan, Nvidia",
    "Author": "Ke-Han Lu, Zhehuai Chen, Szu-Wei Fu, He Huang, Boris Ginsburg, Yu-Chiang Frank Wang, Hung-yi Lee",
    "GitHub_Link": "https://github.com/kehanlu/Nemo/tree/desta/examples/multimodal/DeSTA",
    "Paper_Link": "https://arxiv.org/abs/2406.18871",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "Multilingual",
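    "Description": "DeSTA enhances speech language models through descriptive speech-text alignment: it uses speech captioning to bridge the gap between the speech and text modalities, enabling the model to interpret and generate comprehensive natural language descriptions that cover both linguistic and non-linguistic features of speech."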
  },
  {
    "Category": "Speech Synthesis",
    "Type": "TTS Model",
    "Abbreviation": "E2 TTS",
    "Title": "E2 TTS: Embarrassingly Easy Fully Non-Autoregressive Zero-Shot TTS",
    "Time": "2024-06",
    "Affiliation": "Microsoft",
    "Author": "Sefik Emre Eskimez, Xiaofei Wang, Manthan Thakker, Canrun Li, Chung-Hsien Tsai, Zhen Xiao, Hemin Yang, Zirun Zhu, Min Tang, Xu Tan, Yanqing Liu, Sheng Zhao, Naoyuki Kanda",
    "GitHub_Link": "",
    "Paper_Link": "https://arxiv.org/abs/2406.18009",
    "HF_Link": "",
    "Demo_Link": "https://www.microsoft.com/en-us/research/project/e2-tts/",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "English",
    "Description": "E2 TTS removes nearly all of the usual TTS pipeline complexity — no phoneme aligner, no duration predictor, no explicit grapheme-to-phoneme model — and trains a flow-matching transformer end-to-end on text + audio. Foundational influence on F5-TTS and follow-on systems."
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "MusiLingo",
    "Title": "MusiLingo: Bridging Music and Text with Pre-trained Language Models for Music Captioning and Query Response",
    "Time": "2024-06",
    "Affiliation": "University of Pennsylvania",
    "Author": "Zihao Deng, Yinghao Ma, Yudong Liu, Rongchen Guo, Ge Zhang, Wenhu Chen, Wenhao Huang, Emmanouil Benetos",
    "GitHub_Link": "https://github.com/zihaod/MusiLingo",
    "Paper_Link": "https://arxiv.org/pdf/2309.08730",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "English",
    "Description": "MusiLingo is a novel system that bridges music audio and language by aligning MERT and a frozen LLM via a single projection layer, enabling high-quality music captioning and question answering, supported by the newly introduced MusicInstruct dataset."
  },
  {
    "Category": "Benchmark",
    "Type": "Benchmark",
    "Abbreviation": "SD-Eval",
    "Title": "SD-Eval: A Benchmark Dataset for Spoken Dialogue Understanding Beyond Words",
    "Time": "2024-06",
    "Affiliation": "CUHK, Bytedance",
    "Author": "Junyi Ao, Yuancheng Wang, Xiaohai Tian, Dekun Chen, Jun Zhang, Lu Lu, Yuxuan Wang, Haizhou Li, Zhizheng Wu",
    "GitHub_Link": "https://github.com/amphionspace/SD-Eval",
    "Paper_Link": "https://arxiv.org/pdf/2406.13340",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "",
    "Description": ""
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "Speech ReaLLM",
    "Title": "Speech ReaLLM – Real-time Streaming Speech Recognition with Multimodal LLMs by Teaching the Flow of Time",
    "Time": "2024-06",
    "Affiliation": "Meta",
    "Author": "Frank Seide, Morrie Doulaty, Yangyang Shi, Yashesh Gaur, Junteng Jia, Chunyang Wu",
    "GitHub_Link": "",
    "Paper_Link": "https://arxiv.org/pdf/2406.09569",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "Multilingual",
    "Description": "Speech ReaLLM is a real-time streaming speech recognition model developed by Meta, utilizing multimodal large language models to understand the temporal flow of speech for accurate and efficient transcription."
  },
  {
    "Category": "Benchmark",
    "Type": "Benchmark",
    "Abbreviation": "AIR-Bench",
    "Title": "AIR-Bench: Benchmarking Large Audio-Language Models via Generative Comprehension",
    "Time": "2024-05",
    "Affiliation": "ZJU, Alibaba",
    "Author": "Qian Yang, Jin Xu, Wenrui Liu, Yunfei Chu, Ziyue Jiang, Xiaohuan Zhou, Yichong Leng, Yuanjun Lv, Zhou Zhao, Chang Zhou, Jingren Zhou",
    "GitHub_Link": "https://github.com/OFA-Sys/AIR-Bench",
    "Paper_Link": "https://aclanthology.org/2024.acl-long.109/",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "",
    "Description": ""
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "Audio Flamingo",
    "Title": "Audio Flamingo: A Novel Audio Language Model with Few-Shot Learning and Dialogue Abilities",
    "Time": "2024-05",
    "Affiliation": "Nvidia",
    "Author": "Zhifeng Kong, Arushi Goel, Rohan Badlani, Wei Ping, Rafael Valle, Bryan Catanzaro",
    "GitHub_Link": "https://github.com/NVIDIA/audio-flamingo",
    "Paper_Link": "https://arxiv.org/abs/2402.01831",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "Multilingual",
    "Description": "Audio Flamingo is an audio language model developed by Nvidia with strong audio understanding abilities, the ability to quickly adapt to unseen tasks via in-context learning and retrieval (few-shot learning), and strong multi-turn dialogue capabilities, enabling natural and context-aware interactions about audio."
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "SpeechVerse",
    "Title": "SpeechVerse: A Large-scale Generalizable Audio Language Model",
    "Time": "2024-05",
    "Affiliation": "Amazon AGI",
    "Author": "Nilaksh Das, Saket Dingliwal, Srikanth Ronanki, Rohit Paturi, Zhaocheng Huang, Prashant Mathur, Jie Yuan, Dhanush Bekal, Xing Niu, Sai Muralidhar Jayanthi, Xilai Li, Karel Mundnich, Monica Sunkara, Sravan Bodapati, Sundararajan Srinivasan, Kyu J Han, Katrin Kirchhoff",
    "GitHub_Link": "",
    "Paper_Link": "https://arxiv.org/abs/2405.08295",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "Multilingual",
    "Description": "SpeechVerse is a large-scale multitask audio language model from Amazon AGI. It pairs a frozen speech foundation model with an LLM through a small trainable adapter and is supervised on a broad mixture of speech tasks via natural-language instructions, achieving strong zero- and few-shot generalisation across 11 speech understanding tasks."
  },
  {
    "Category": "Safety",
    "Type": "Method",
    "Abbreviation": "VoiceJailbreak",
    "Title": "Voice Jailbreak Attacks Against GPT-4o",
    "Time": "2024-05",
    "Affiliation": "CISPA",
    "Author": "Xinyue Shen, Yixin Wu, Michael Backes, Yang Zhang",
    "GitHub_Link": "https://github.com/TrustAIRLab/VoiceJailbreakAttack",
    "Paper_Link": "https://arxiv.org/pdf/2405.19103",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "Multilingual",
    "Description": ""
  },
  {
    "Category": "Dataset Resource",
    "Type": "Dataset",
    "Abbreviation": "LibriSQA",
    "Title": "LibriSQA: A Novel Dataset and Framework for Spoken Question Answering with Large Language Models",
    "Time": "2024-04",
    "Affiliation": "Shanghai Jiao Tong University",
    "Author": "Zihan Zhao, Yiyang Jiang, Heyang Liu, Yanfeng Wang, Yu Wang",
    "GitHub_Link": "https://github.com/ZihanZhaoSJTU/LibriSQA",
    "Paper_Link": "https://arxiv.org/abs/2308.10390",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "-",
    "Audio_Output": "-",
    "Language": "English",
    "Description": "While Large Language Models (LLMs) have demonstrated commendable performance across a myriad of domains and tasks, existing LLMs still exhibit a palpable deficit in handling multimodal functionalities, especially for the Spoken Question Answering (SQA) task which necessitates precise alignment and deep interaction between speech and text features. To address the SQA challenge on LLMs, we initially curated the free-form and open-ended LibriSQA dataset from Librispeech, comprising Part I with natural conversational formats and Part II encompassing multiple-choice questions followed by answers and analytical segments. Both parts collectively include 107k SQA pairs that cover various topics. Given the evident paucity of existing speech-text LLMs, we propose a lightweight, end-to-end framework to execute the SQA task on the LibriSQA, witnessing significant results. By reforming ASR into the SQA format, we further substantiate our framework's capability in handling ASR tasks. Our empirical findings bolster the LLMs' aptitude for aligning and comprehending multimodal information, paving the way for the development of universal multimodal LLMs."
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "SALMONN",
    "Title": "SALMONN: Towards Generic Hearing Abilities for Large Language Models",
    "Time": "2024-04",
    "Affiliation": "Tsinghua",
    "Author": "Changli Tang, Wenyi Yu, Guangzhi Sun, Xianzhao Chen, Tian Tan, Wei Li, Lu Lu, Zejun Ma, Chao Zhang",
    "GitHub_Link": "https://github.com/bytedance/SALMONN",
    "Paper_Link": "https://arxiv.org/pdf/2310.13289.pdf",
    "HF_Link": "",
    "Demo_Link": "https://huggingface.co/spaces/tsinghua-ee/SALMONN-7B-gradio",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "Multilingual",
    "Description": "SALMONN is a model developed by Tsinghua University aiming to equip large language models with generic hearing abilities, enhancing their capacity to process and understand diverse audio inputs."
  },
  {
    "Category": "Benchmark",
    "Type": "Benchmark",
    "Abbreviation": "SpokenWOZ",
    "Title": "SpokenWOZ: A Large-Scale Speech-Text Benchmark for Spoken Task-Oriented Dialogue Agents",
    "Time": "2024-03",
    "Affiliation": "Alibaba DAMO Academy",
    "Author": "Shuzheng Si, Wentao Ma, Haoyu Gao, Yuchuan Wu, Ting-En Lin, Yinpei Dai, Hangyu Li, Rui Yan, Fei Huang, Yongbin Li",
    "GitHub_Link": "https://github.com/AlibabaResearch/DAMO-ConvAI/tree/main/spokenwoz",
    "Paper_Link": "https://arxiv.org/abs/2305.13040",
    "HF_Link": "",
    "Demo_Link": "https://spokenwoz.github.io/",
    "Other_Link": "",
    "Audio_Input": "-",
    "Audio_Output": "-",
    "Language": "English",
    "Description": "Task-oriented dialogue (TOD) models have made significant progress in recent years. However, previous studies primarily focus on datasets written by annotators, which has resulted in a gap between academic research and real-world spoken conversation scenarios. While several small-scale spoken TOD datasets are proposed to address robustness issues such as ASR errors, they ignore the unique challenges in spoken conversation. To tackle the limitations, we introduce SpokenWOZ, a large-scale speech-text dataset for spoken TOD, containing 8 domains, 203k turns, 5.7k dialogues and 249 hours of audios from human-to-human spoken conversations. SpokenWOZ further incorporates common spoken characteristics such as word-by-word processing and reasoning in spoken language. Based on these characteristics, we present cross-turn slot and reasoning slot detection as new challenges. We conduct experiments on various baselines, including text-modal models, newly proposed dual-modal models, and LLMs, e.g., ChatGPT. The results show that the current models still have substantial room for improvement in spoken conversation, where the most advanced dialogue state tracker only achieves 25.65% in joint goal accuracy and the SOTA end-to-end model only correctly completes the user request in 52.1% of dialogues. The dataset, code, and leaderboard are available: https://spokenwoz.github.io/."
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "WavLLM",
    "Title": "WavLLM: Towards Robust and Adaptive Speech Large Language Model",
    "Time": "2024-03",
    "Affiliation": "CUHK",
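    "Author": "Shujie Hu, Long Zhou, Shujie Liu, Sanyuan Chen, Lingwei Meng, Hongkun Hao, Jing Pan, Xunying Liu, Jinyu Li, Sunit Sivasankaran, Linquan Liu, Furu Wei",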
    "GitHub_Link": "https://github.com/microsoft/SpeechT5/tree/main/WavLLM",
    "Paper_Link": "https://arxiv.org/pdf/2404.00656",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "Multilingual",
    "Description": "WavLLM is a speech large language model developed by CUHK, designed to be robust and adaptive across various speech processing tasks, supporting multilingual audio inputs for comprehensive language understanding."
  },
  {
    "Category": "Survey",
    "Type": "Survey",
    "Abbreviation": "AudioLM-Survey",
    "Title": "Towards audio language modeling -- an overview",
    "Time": "2024-02",
    "Affiliation": "National Taiwan University, MIT",
    "Author": "Haibin Wu, Xuanjun Chen, Yi-Cheng Lin, Kai-wei Chang, Ho-Lam Chung, Alexander H. Liu, Hung-yi Lee",
    "GitHub_Link": "",
    "Paper_Link": "https://arxiv.org/abs/2402.13236",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "No",
    "Audio_Output": "No",
    "Language": "",
    "Description": ""
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "SLAM-LLM",
    "Title": "An Embarrassingly Simple Approach for LLM with Strong ASR Capacity",
    "Time": "2024-02",
    "Affiliation": "Shanghai Jiao Tong University (SJTU)",
    "Author": "Ziyang Ma, Guanrou Yang, Yifan Yang, Zhifu Gao, Jiaming Wang, Zhihao Du, Fan Yu, Qian Chen, Siqi Zheng, Shiliang Zhang, Xie Chen",
    "GitHub_Link": "https://github.com/X-LANCE/SLAM-LLM",
    "Paper_Link": "https://arxiv.org/pdf/2402.08846",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "Multilingual",
    "Description": "SLAM-LLM is a model developed by SJTU that integrates large language models with strong automatic speech recognition (ASR) capabilities, providing a simple yet effective approach for speech-to-text tasks across multiple languages."
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "Pengi",
    "Title": "Pengi: An Audio Language Model for Audio Tasks",
    "Time": "2024-01",
    "Affiliation": "Microsoft",
    "Author": "Soham Deshmukh, Benjamin Elizalde, Rita Singh, Huaming Wang",
    "GitHub_Link": "https://github.com/microsoft/Pengi",
    "Paper_Link": "https://arxiv.org/pdf/2305.11834.pdf",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "Multilingual",
    "Description": "Pengi is an audio language model developed by Microsoft that frames all audio tasks as text-generation tasks: it takes an audio recording and a text prompt as input and generates free-form text as output, covering both open-ended and closed-ended audio tasks."
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "Qwen-Audio",
    "Title": "Qwen-Audio: Advancing Universal Audio Understanding via Unified Large-Scale Audio-Language Models",
    "Time": "2023-12",
    "Affiliation": "Alibaba",
    "Author": "Yunfei Chu, Jin Xu, Xiaohuan Zhou, Qian Yang, Shiliang Zhang, Zhijie Yan, Chang Zhou, Jingren Zhou",
    "GitHub_Link": "https://github.com/QwenLM/Qwen-Audio",
    "Paper_Link": "https://arxiv.org/pdf/2311.07919.pdf",
    "HF_Link": "",
    "Demo_Link": "https://qwen-audio.github.io/Qwen-Audio/",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "Multilingual",
    "Description": "Qwen-Audio is a large-scale audio-language model developed by Alibaba, aiming to advance universal audio understanding by integrating audio and language processing capabilities in a unified framework."
  },
  {
    "Category": "Multimodal",
    "Type": "Model",
    "Abbreviation": "CoDi-2",
    "Title": "CoDi-2: In-Context, Interleaved, and Interactive Any-to-Any Generation",
    "Time": "2023-11",
    "Affiliation": "UC Berkeley",
    "Author": "Zineng Tang, Ziyi Yang, Mahmoud Khademi, Yang Liu, Chenguang Zhu, Mohit Bansal",
    "GitHub_Link": "https://github.com/microsoft/i-Code/tree/main/CoDi-2",
    "Paper_Link": "https://arxiv.org/pdf/2311.18775",
    "HF_Link": "",
    "Demo_Link": "https://codi-2.github.io/",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "English",
    "Description": ""
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "OpenJMLA",
    "Title": "Joint Music and Language Attention Models for Zero-shot Music Tagging",
    "Time": "2023-10",
    "Affiliation": "ByteDance",
    "Author": "Xingjian Du, Zhesong Yu, Jiaju Lin, Bilei Zhu, Qiuqiang Kong",
    "GitHub_Link": "",
    "Paper_Link": "https://arxiv.org/abs/2310.10159",
    "HF_Link": "https://huggingface.co/UniMus/OpenJMLA",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "English",
    "Description": "JMLA (Joint Music and Language Attention) introduces an open-set music tagging model that combines a pretrained music encoder with a language model via attention, enabling zero-shot tagging on arbitrary tag vocabularies rather than fixed closed-set labels."
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "UniAudio",
    "Title": "An Audio Foundation Model Toward Universal Audio Generation",
    "Time": "2023-10",
    "Affiliation": "Chinese University of Hong Kong (CUHK)",
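    "Author": "Dongchao Yang, Jinchuan Tian, Xu Tan, Rongjie Huang, Songxiang Liu, Xuankai Chang, Jiatong Shi, Sheng Zhao, Jiang Bian, Xixin Wu, Zhou Zhao, Shinji Watanabe, Helen Meng",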
    "GitHub_Link": "https://github.com/yangdongchao/UniAudio",
    "Paper_Link": "https://arxiv.org/abs/2310.00704",
    "HF_Link": "",
    "Demo_Link": "https://dongchaoyang.top/UniAudio_demo/",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "Multilingual",
    "Description": "UniAudio is an audio foundation model developed by CUHK, aiming toward universal audio generation by supporting various audio generation tasks, including speech, sound, music, and singing voice, based on diverse input conditions."
  },
  {
    "Category": "Benchmark",
    "Type": "Benchmark",
    "Abbreviation": "Dynamic-SUPERB",
    "Title": "Dynamic-SUPERB: Towards A Dynamic, Collaborative, and Comprehensive Instruction-Tuning Benchmark for Speech",
    "Time": "2023-09",
    "Affiliation": "NTU-Taiwan, etc.",
    "Author": "Chien-yu Huang, Ke-Han Lu, Shih-Heng Wang, Chi-Yuan Hsiao, Chun-Yi Kuan, Haibin Wu, Siddhant Arora, Kai-Wei Chang, Jiatong Shi, Yifan Peng, Roshan Sharma, Shinji Watanabe, Bhiksha Ramakrishnan, Shady Shehata, Hung-yi Lee",
    "GitHub_Link": "https://github.com/dynamic-superb/dynamic-superb",
    "Paper_Link": "https://arxiv.org/abs/2309.09510",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "",
    "Description": ""
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "LLaSM",
    "Title": "LLaSM: Large Language and Speech Model",
    "Time": "2023-09",
    "Affiliation": "LinkSoul.AI",
    "Author": "Yu Shu, Siwei Dong, Guangyao Chen, Wenhao Huang, Ruihua Zhang, Daochen Shi, Qiqi Xiang, Yemin Shi",
    "GitHub_Link": "https://github.com/LinkSoul-AI/LLaSM",
    "Paper_Link": "https://arxiv.org/pdf/2308.15930.pdf",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "Bilingual (Chinese and English)",
    "Description": "LLaSM is a large language and speech model developed by LinkSoul.AI, supporting bilingual (Chinese and English) speech-text multimodal dialogues. It offers convenient speech input, enhancing user experience by avoiding the complexities and potential errors associated with ASR-based solutions."
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "LTU-AS",
    "Title": "Joint Audio and Speech Understanding",
    "Time": "2023-09",
    "Affiliation": "MIT, IBM Research",
    "Author": "Yuan Gong, Alexander H. Liu, Hongyin Luo, Leonid Karlinsky, James Glass",
    "GitHub_Link": "https://github.com/YuanGongND/ltu",
    "Paper_Link": "https://arxiv.org/abs/2309.14405",
    "HF_Link": "https://huggingface.co/spaces/yuangongfdu/ltu-2",
    "Demo_Link": "https://huggingface.co/spaces/yuangongfdu/ltu-2",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "English",
    "Description": "LTU-AS (Listen, Think, and Understand — Audio and Speech) extends the LTU model to jointly handle non-speech audio and speech understanding by combining a Whisper-style speech encoder with an audio event encoder, feeding both into an LLM for unified reasoning over audio inputs."
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "Segment-level Q-Former",
    "Title": "Connecting Speech Encoder and Large Language Model for ASR",
    "Time": "2023-09",
    "Affiliation": "Tsinghua University, ByteDance",
    "Author": "Wenyi Yu, Changli Tang, Guangzhi Sun, Xianzhao Chen, Tian Tan, Wei Li, Lu Lu, Zejun Ma, Chao Zhang",
    "GitHub_Link": "",
    "Paper_Link": "https://arxiv.org/pdf/2309.13963",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "",
    "Description": "This paper presents a comparative study of three connector structures—fully connected layers, multi-head cross-attention, and Q-Former—for integrating speech encoders with large language models (LLMs) in automatic speech recognition (ASR) systems. The study finds that LLMs with Q-Formers achieve consistent and significant word error rate reductions over other connector structures. Additionally, a novel segment-level Q-Former is proposed to enable LLMs to recognize longer speech segments, resulting in further performance improvements."
  },
  {
    "Category": "Audio Generation",
    "Type": "Audio Generation Model",
    "Abbreviation": "AudioLDM 2",
    "Title": "AudioLDM 2: Learning Holistic Audio Generation with Self-supervised Pretraining",
    "Time": "2023-08",
    "Affiliation": "University of Surrey, Imperial College London",
    "Author": "Haohe Liu, Qiao Tian, Yi Yuan, Xubo Liu, Xinhao Mei, Qiuqiang Kong, Yuping Wang, Wenwu Wang, Yuxuan Wang, Mark D. Plumbley",
    "GitHub_Link": "https://github.com/haoheliu/AudioLDM2",
    "Paper_Link": "https://arxiv.org/abs/2308.05734",
    "HF_Link": "https://huggingface.co/cvssp/audioldm2",
    "Demo_Link": "https://audioldm.github.io/audioldm2/",
    "Other_Link": "",
    "Audio_Input": "No",
    "Audio_Output": "Yes",
    "Language": "English",
    "Description": "AudioLDM 2 unifies speech, sound effects, and music generation in a single latent diffusion framework by introducing a shared 'language of audio' learnt from self-supervised pretraining, enabling holistic high-quality audio generation from text."
  },
  {
    "Category": "Speech Recognition",
    "Type": "Speech Translation Model",
    "Abbreviation": "SeamlessM4T",
    "Title": "SeamlessM4T: Massively Multilingual & Multimodal Machine Translation",
    "Time": "2023-08",
    "Affiliation": "Meta AI",
    "Author": "Seamless Communication Team (Loïc Barrault, Yu-An Chung, Mariano Coria Meglioli, et al.)",
    "GitHub_Link": "https://github.com/facebookresearch/seamless_communication",
    "Paper_Link": "https://arxiv.org/abs/2308.11596",
    "HF_Link": "https://huggingface.co/facebook/seamless-m4t-v2-large",
    "Demo_Link": "https://seamless.metademolab.com/",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "Multilingual",
    "Description": "SeamlessM4T is Meta's unified multilingual multimodal translation model covering ASR, speech-to-text translation, speech-to-speech translation, text-to-text and text-to-speech across nearly 100 input and 35+ output languages in a single system."
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "Prompting LLMs with Speech Recognition",
    "Title": "Prompting Large Language Models with Speech Recognition Abilities",
    "Time": "2023-07",
    "Affiliation": "Meta",
    "Author": "Yassir Fathullah, Chunyang Wu, Egor Lakomkin, Junteng Jia, Yuan Shangguan, Ke Li, Jinxi Guo, Wenhan Xiong, Jay Mahadeokar, Ozlem Kalinli, Christian Fuegen, Mike Seltzer",
    "GitHub_Link": "",
    "Paper_Link": "https://arxiv.org/pdf/2307.11795",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "",
    "Description": "This paper presents a method to extend large language models (LLMs) with speech recognition capabilities by integrating a small audio encoder. By prepending audio embeddings to text token embeddings, the LLM can function as an automatic speech recognition (ASR) system. Experiments demonstrate that incorporating a conformer encoder into the LLaMA-7B model enables it to outperform monolingual baselines and perform multilingual speech recognition, despite being predominantly trained on English text."
  },
  {
    "Category": "Audio Generation",
    "Type": "Neural Audio Codec",
    "Abbreviation": "DAC",
    "Title": "High-Fidelity Audio Compression with Improved RVQGAN",
    "Time": "2023-06",
    "Affiliation": "Descript",
    "Author": "Rithesh Kumar, Prem Seetharaman, Alejandro Luebs, Ishaan Kumar, Kundan Kumar",
    "GitHub_Link": "https://github.com/descriptinc/descript-audio-codec",
    "Paper_Link": "https://arxiv.org/abs/2306.06546",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "Multilingual",
    "Description": "Descript Audio Codec (DAC) is a high-fidelity universal 44.1 kHz neural audio codec achieving ~90x compression with substantially better quality than EnCodec, widely used as the discrete tokenizer for downstream audio generation models."
  },
  {
    "Category": "Multimodal",
    "Type": "Model",
    "Abbreviation": "Macaw-LLM",
    "Title": "Macaw-LLM: Multi-Modal Language Modeling with Image, Video, Audio, and Text Integration",
    "Time": "2023-06",
    "Affiliation": "Tencent",
    "Author": "Chenyang Lyu, Minghao Wu, Longyue Wang, Xinting Huang, Bingshuai Liu, Zefeng Du, Shuming Shi, Zhaopeng Tu",
    "GitHub_Link": "https://github.com/lyuchenyang/Macaw-LLM",
    "Paper_Link": "https://arxiv.org/pdf/2306.09093",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "Multilingual",
    "Description": ""
  },
  {
    "Category": "Audio Generation",
    "Type": "Music Generation Model",
    "Abbreviation": "MusicGen",
    "Title": "Simple and Controllable Music Generation",
    "Time": "2023-06",
    "Affiliation": "Meta AI",
    "Author": "Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi, Alexandre Défossez",
    "GitHub_Link": "https://github.com/facebookresearch/audiocraft",
    "Paper_Link": "https://arxiv.org/abs/2306.05284",
    "HF_Link": "https://huggingface.co/facebook/musicgen-large",
    "Demo_Link": "https://huggingface.co/spaces/facebook/MusicGen",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "English",
    "Description": "MusicGen is Meta's single-stage autoregressive transformer for controllable text-conditioned music generation, operating over discrete EnCodec tokens with optional melody conditioning. Part of the AudioCraft suite."
  },
  {
    "Category": "Speech Synthesis",
    "Type": "TTS Model",
    "Abbreviation": "StyleTTS 2",
    "Title": "StyleTTS 2: Towards Human-Level Text-to-Speech through Style Diffusion and Adversarial Training with Large Speech Language Models",
    "Time": "2023-06",
    "Affiliation": "Columbia University",
    "Author": "Yinghao Aaron Li, Cong Han, Vinay S. Raghavan, Gavin Mischler, Nima Mesgarani",
    "GitHub_Link": "https://github.com/yl4579/StyleTTS2",
    "Paper_Link": "https://arxiv.org/abs/2306.07691",
    "HF_Link": "https://huggingface.co/spaces/styletts2/styletts2",
    "Demo_Link": "https://styletts2.github.io/",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "English",
    "Description": "StyleTTS 2 models speech styles as a latent random variable through diffusion and adversarial training with large speech language models, achieving human-level naturalness on LJSpeech and strong zero-shot speaker cloning."
  },
  {
    "Category": "Speech Recognition",
    "Type": "Speech Recognition Toolkit",
    "Abbreviation": "FunASR",
    "Title": "FunASR: A Fundamental End-to-End Speech Recognition Toolkit",
    "Time": "2023-05",
    "Affiliation": "Speech Lab, Alibaba DAMO Academy",
    "Author": "Zhifu Gao, Zerui Li, Jiaming Wang, Haoneng Luo, Xian Shi, Mengzhe Chen, Yabin Li, Lingyun Zuo, Zhihao Du, Zhangyu Xiao, Shiliang Zhang",
    "GitHub_Link": "https://github.com/modelscope/FunASR",
    "Paper_Link": "https://arxiv.org/abs/2305.11013",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "Multilingual",
    "Description": "FunASR is an industrial-grade open-source speech recognition toolkit from Alibaba's Speech Lab that bridges academic research and production deployment. It ships pretrained models including the non-autoregressive Paraformer (SOTA CER on many Mandarin benchmarks), FSMN-VAD, punctuation restoration, CAM++ speaker diarization, timestamp prediction, and streaming recognition across 50+ languages."
  },
  {
    "Category": "Speech Recognition",
    "Type": "Speech Recognition Model",
    "Abbreviation": "MMS",
    "Title": "Scaling Speech Technology to 1,000+ Languages",
    "Time": "2023-05",
    "Affiliation": "Meta AI",
    "Author": "Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli",
    "GitHub_Link": "https://github.com/facebookresearch/fairseq/tree/main/examples/mms",
    "Paper_Link": "https://arxiv.org/abs/2305.13516",
    "HF_Link": "https://huggingface.co/facebook/mms-1b-all",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "Multilingual",
    "Description": "MMS (Massively Multilingual Speech) scales wav2vec 2.0-based speech technology to over a thousand languages: pretrained models covering 1,406 languages, a single multilingual ASR model and TTS models for 1,107 languages, and a language identification model for 4,017 languages, dramatically expanding speech coverage beyond the previously dominant ~100 languages."
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "SpeechGPT",
    "Title": "SpeechGPT: Empowering Large Language Models with Intrinsic Cross-Modal Conversational Abilities",
    "Time": "2023-05",
    "Affiliation": "Fudan University",
    "Author": "Dong Zhang, Shimin Li, Xin Zhang, Jun Zhan, Pengyu Wang, Yaqian Zhou, Xipeng Qiu",
    "GitHub_Link": "https://github.com/0nutation/SpeechGPT/tree/main/speechgpt",
    "Paper_Link": "https://arxiv.org/pdf/2305.11000.pdf",
    "HF_Link": "",
    "Demo_Link": "https://0nutation.github.io/SpeechGPT.github.io/",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "",
    "Description": "SpeechGPT is a multimodal large language model developed by Fudan University, capable of perceiving and generating multimodal content following human instructions. It integrates cross-modal conversational abilities, enabling it to handle tasks involving speech and text seamlessly."
  },
  {
    "Category": "Model and Methods",
    "Type": "Model",
    "Abbreviation": "AudioGPT",
    "Title": "AudioGPT: Understanding and Generating Speech, Music, Sound, and Talking Head",
    "Time": "2023-04",
    "Affiliation": "Zhejiang University",
    "Author": "Rongjie Huang, Mingze Li, Dongchao Yang, Jiatong Shi, Xuankai Chang, Zhenhui Ye, Yuning Wu, Zhiqing Hong, Jiawei Huang, Jinglin Liu, Yi Ren, Zhou Zhao, Shinji Watanabe",
    "GitHub_Link": "https://github.com/AIGC-Audio/AudioGPT",
    "Paper_Link": "https://arxiv.org/pdf/2304.12995.pdf",
    "HF_Link": "",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "",
    "Description": "AudioGPT is a multimodal AI system that integrates Large Language Models (LLMs) with foundation models to process complex audio information, enabling tasks such as understanding and generating speech, music, sound, and talking head. It supports spoken dialogue through ASR and TTS interfaces, facilitating human-like interactions and content creation."
  },
  {
    "Category": "Speech Synthesis",
    "Type": "TTS Model",
    "Abbreviation": "VALL-E",
    "Title": "Neural Codec Language Models are Zero-Shot Text to Speech Synthesizers",
    "Time": "2023-01",
    "Affiliation": "Microsoft",
    "Author": "Chengyi Wang, Sanyuan Chen, Yu Wu, Ziqiang Zhang, Long Zhou, Shujie Liu, Zhuo Chen, Yanqing Liu, Huaming Wang, Jinyu Li, Lei He, Sheng Zhao, Furu Wei",
    "GitHub_Link": "https://github.com/microsoft/unilm/tree/master/valle",
    "Paper_Link": "https://arxiv.org/abs/2301.02111",
    "HF_Link": "",
    "Demo_Link": "https://www.microsoft.com/en-us/research/project/vall-e-x/",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "English",
    "Description": "VALL-E reframes text-to-speech as a conditional language modeling task over discrete audio codec tokens (EnCodec), enabling zero-shot voice cloning from a 3-second enrollment recording with strong speaker similarity and prosody."
  },
  {
    "Category": "Speech Recognition",
    "Type": "Speech Recognition Model",
    "Abbreviation": "Whisper",
    "Title": "Robust Speech Recognition via Large-Scale Weak Supervision",
    "Time": "2022-12",
    "Affiliation": "OpenAI",
    "Author": "Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever",
    "GitHub_Link": "https://github.com/openai/whisper",
    "Paper_Link": "https://arxiv.org/abs/2212.04356",
    "HF_Link": "https://huggingface.co/openai/whisper-large-v3",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "No",
    "Language": "Multilingual",
    "Description": "Whisper is OpenAI's open-source speech recognition model trained on 680K hours of multilingual and multitask supervised data from the web. It performs robust transcription, translation to English, and language identification across 99 languages."
  },
  {
    "Category": "Audio Generation",
    "Type": "Neural Audio Codec",
    "Abbreviation": "EnCodec",
    "Title": "High Fidelity Neural Audio Compression",
    "Time": "2022-10",
    "Affiliation": "Meta AI",
    "Author": "Alexandre Défossez, Jade Copet, Gabriel Synnaeve, Yossi Adi",
    "GitHub_Link": "https://github.com/facebookresearch/encodec",
    "Paper_Link": "https://arxiv.org/abs/2210.13438",
    "HF_Link": "https://huggingface.co/facebook/encodec_24khz",
    "Demo_Link": "",
    "Other_Link": "",
    "Audio_Input": "Yes",
    "Audio_Output": "Yes",
    "Language": "Multilingual",
    "Description": "EnCodec is Meta's streaming neural audio codec that compresses 24/48 kHz audio with high perceptual fidelity using a residual vector quantizer. Its discrete tokens are the foundation for MusicGen, AudioGen, and VALL-E."
  },
  {
    "Category": "Audio Generation",
    "Type": "Audio Generation Model",
    "Abbreviation": "AudioGen",
    "Title": "AudioGen: Textually Guided Audio Generation",
    "Time": "2022-09",
    "Affiliation": "Meta AI, Hebrew University of Jerusalem",
    "Author": "Felix Kreuk, Gabriel Synnaeve, Adam Polyak, Uriel Singer, Alexandre Défossez, Jade Copet, Devi Parikh, Yaniv Taigman, Yossi Adi",
    "GitHub_Link": "https://github.com/facebookresearch/audiocraft",
    "Paper_Link": "https://arxiv.org/abs/2209.15352",
    "HF_Link": "https://huggingface.co/facebook/audiogen-medium",
    "Demo_Link": "https://felixkreuk.github.io/audiogen/",
    "Other_Link": "",
    "Audio_Input": "No",
    "Audio_Output": "Yes",
    "Language": "English",
    "Description": "AudioGen is a transformer-based autoregressive model for text-to-environmental-sound generation, trained on discrete audio tokens. It established the recipe later used by MusicGen and is part of Meta's AudioCraft."
  }
]