Publications | Chien-Yu Lin

2025

xKV: Cross-Layer SVD for KV-Cache Compression

Chi-Chih Chang, Chien-Yu Lin, Yash Akhauri, and 4 more authors

2025

% xKV preprint; located through its arXiv eprint identifier.
@misc{chang2025xkv,
  title         = {xKV: Cross-Layer SVD for KV-Cache Compression},
  author        = {Chang, Chi-Chih and Lin, Chien-Yu and Akhauri, Yash and Lin, Wei-Cheng and Wu, Kai-Chiang and Ceze, Luis and Abdelfattah, Mohamed S.},
  year          = {2025},
  eprint        = {2503.18893},
  archiveprefix = {arXiv},
}

xKV: Cross-Layer SVD for KV-Cache Compression

2025

TeleRAG: Efficient Retrieval-Augmented Generation Inference with Lookahead Retrieval

Chien-Yu Lin^*, Keisuke Kamahori^*, Yiyu Liu, and 11 more authors

2025

Bib PDF

% TeleRAG preprint; no journal is recorded, so it is typed as misc and
% identified by the arXiv eprint number taken from venue_url.
@misc{lin2025telerag,
  title         = {TeleRAG: Efficient Retrieval-Augmented Generation Inference with Lookahead Retrieval},
  author        = {Lin, Chien-Yu and Kamahori, Keisuke and Liu, Yiyu and Shi, Xiaoxiang and Kashyap, Madhav and Gu, Yile and Shao, Rulin and Ye, Zihao and Zhu, Kan and Wang, Stephanie and Krishnamurthy, Arvind and Kadekodi, Rohan and Ceze, Luis and Kasikci, Baris},
  year          = {2025},
  eprint        = {2502.20969},
  archiveprefix = {arXiv},
  venue_url     = {https://arxiv.org/abs/2502.20969},
}

Palu: Compressing KV-Cache with Low-Rank Projection

Chi-Chih Chang^*, Wei-Cheng Lin^*, Chien-Yu Lin^*, and 7 more authors

In Proceedings of International Conference on Learning Representations (ICLR), 2025

Bib PDF

% Palu conference paper (ICLR); venue_url points at the arXiv abstract page.
@inproceedings{chang2024palu,
  title     = {Palu: Compressing KV-Cache with Low-Rank Projection},
  author    = {Chang, Chi-Chih and Lin, Wei-Cheng and Lin, Chien-Yu and Chen, Chong-Yan and Hu, Yu-Fang and Wang, Pei-Shuo and Huang, Ning-Chi and Ceze, Luis and Abdelfattah, Mohamed S. and Wu, Kai-Chiang},
  booktitle = {Proceedings of International Conference on Learning Representations (ICLR)},
  year      = {2025},
  venue_url = {https://arxiv.org/abs/2407.21118},
}

NanoFlow: Towards Optimal Large Language Model Serving Throughput

Kan Zhu, Yilong Zhao, Liangyu Zhao, and 12 more authors

In 19th USENIX Symposium on Operating Systems Design and Implementation (OSDI), 2025

Bib PDF Code

% NanoFlow conference paper (OSDI). The year lives only in the year field so
% that styles do not print it twice; the eprint number is an arXiv identifier.
@inproceedings{zhu2024nanoflow,
  title         = {NanoFlow: Towards Optimal Large Language Model Serving Throughput},
  author        = {Zhu, Kan and Zhao, Yilong and Zhao, Liangyu and Zuo, Gefei and Gu, Yile and Xie, Dedong and Gao, Yufei and Xu, Qinyu and Tang, Tian and Ye, Zihao and Kamahori, Keisuke and Lin, Chien-Yu and Wang, Stephanie and Krishnamurthy, Arvind and Kasikci, Baris},
  booktitle     = {19th USENIX Symposium on Operating Systems Design and Implementation (OSDI)},
  year          = {2025},
  eprint        = {2408.12757},
  archiveprefix = {arXiv},
}

2024

Efficient Encoder-Decoder Transformer Decoding for Decomposable Tasks

Bo-Ru Lu, Nikita Haduong, Chien-Yu Lin, and 3 more authors

arXiv preprint arXiv:2403.13112, 2024

Bib PDF

% Preprint on encoder-decoder decoding; the journal field carries the arXiv
% designation that the page renders as the venue line.
@article{lu2024efficient,
  author  = {Lu, Bo-Ru and Haduong, Nikita and Lin, Chien-Yu and Cheng, Hao and Smith, Noah A. and Ostendorf, Mari},
  title   = {Efficient Encoder-Decoder Transformer Decoding for Decomposable Tasks},
  journal = {arXiv preprint arXiv:2403.13112},
  year    = {2024},
}

Atom: Low-Bit Quantization for Efficient and Accurate LLM Serving

Yilong Zhao, Chien-Yu Lin, Kan Zhu, and 7 more authors

In Proceedings of Machine Learning and Systems (MLSys), 2024

Bib PDF Code

% Atom conference paper (MLSys), volume 6, pages 196--209; url links the
% proceedings PDF.
@inproceedings{zhao2024atom,
  title     = {Atom: Low-Bit Quantization for Efficient and Accurate LLM Serving},
  author    = {Zhao, Yilong and Lin, Chien-Yu and Zhu, Kan and Ye, Zihao and Chen, Lequn and Zheng, Size and Ceze, Luis and Krishnamurthy, Arvind and Chen, Tianqi and Kasikci, Baris},
  booktitle = {Proceedings of Machine Learning and Systems (MLSys)},
  volume    = {6},
  pages     = {196--209},
  year      = {2024},
  url       = {https://proceedings.mlsys.org/paper_files/paper/2024/file/5edb57c05c81d04beb716ef1d542fe9e-Paper-Conference.pdf},
}

FastSR-NeRF: Improving NeRF Efficiency on Consumer Devices with A Simple Super-Resolution Pipeline

Chien-Yu Lin, Qichen Fu, Thomas Merth, and 2 more authors

In Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision (WACV), Jan 2024

Oral (Top 2.6%)

Bib PDF

% FastSR-NeRF conference paper (WACV). The page range uses the double hyphen
% that BibTeX styles expect for an en dash.
@inproceedings{lin2024fastsrnerf,
  author     = {Lin, Chien-Yu and Fu, Qichen and Merth, Thomas and Yang, Karren and Ranjan, Anurag},
  title      = {FastSR-NeRF: Improving NeRF Efficiency on Consumer Devices with A Simple Super-Resolution Pipeline},
  booktitle  = {Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)},
  month      = jan,
  year       = {2024},
  pages      = {2482--2491},
  venue_type = {Conference},
}

2022

SPIN: An Empirical Evaluation on Sharing Parameters of Isotropic Networks

Chien-Yu Lin^*, Anish Prabhu^*, Thomas Merth, and 4 more authors

In Proceedings of the 17th European Conference on Computer Vision (ECCV), Oct 2022

Bib PDF Video Code

% SPIN conference paper (ECCV 2022). The month is given explicitly so
% renderers do not fall back to a default month.
@inproceedings{lin2022spin,
  author     = {Lin, Chien-Yu and Prabhu, Anish and Merth, Thomas and Mehta, Sachin and Ranjan, Anurag and Horton, Maxwell and Rastegari, Mohammad},
  title      = {SPIN: An Empirical Evaluation on Sharing Parameters of Isotropic Networks},
  booktitle  = {Proceedings of the 17th European Conference on Computer Vision (ECCV)},
  month      = oct,
  year       = {2022},
  venue_type = {Conference},
}

2021

Accelerating SpMM Kernel with Cache-First Edge Sampling for Graph Neural Networks

Chien-Yu Lin, Liang Luo, and Luis Ceze

arXiv preprint arXiv:2104.10716, Apr 2021

Bib PDF

% Preprint on sampled SpMM kernels for graph neural networks. The month
% follows the YYMM prefix of the arXiv identifier (2104).
@article{lin2021esspmm,
  title      = {Accelerating SpMM Kernel with Cache-First Edge Sampling for Graph Neural Networks},
  author     = {Lin, Chien-Yu and Luo, Liang and Ceze, Luis},
  journal    = {arXiv preprint arXiv:2104.10716},
  month      = apr,
  year       = {2021},
  venue_type = {ArXiv},
}

2019

Enhancing Utilization of SIMD-Like Accelerator for Sparse Convolutional Neural Networks

Bo-Cheng Lai, Jyun-Wei Pan, and Chien-Yu Lin

IEEE Transactions on Very Large Scale Integration (VLSI) Systems, Jan 2019

Bib PDF

% Journal article in IEEE Transactions on VLSI Systems; no volume, number or
% pages are recorded in this entry.
@article{lai2019enhancing,
  author  = {Lai, Bo-Cheng and Pan, Jyun-Wei and Lin, Chien-Yu},
  title   = {Enhancing Utilization of SIMD-Like Accelerator for Sparse Convolutional Neural Networks},
  journal = {IEEE Transactions on Very Large Scale Integration (VLSI) Systems},
  year    = {2019},
}

2018

Supporting compressed-sparse activations and weights on SIMD-like accelerator for sparse convolutional neural networks

Chien-Yu Lin and Bo-Cheng Lai

In Proceedings of the 23rd Asia and South Pacific Design Automation Conference (ASP-DAC), Jan 2018

Bib PDF Slides

% Conference paper in the ASP-DAC 2018 proceedings.
@inproceedings{lin2018supporting,
  title     = {Supporting compressed-sparse activations and weights on SIMD-like accelerator for sparse convolutional neural networks},
  author    = {Lin, Chien-Yu and Lai, Bo-Cheng},
  booktitle = {Proceedings of the 23rd Asia and South Pacific Design Automation Conference (ASP-DAC)},
  year      = {2018},
}