@inproceedings{jawahar-etal-2024-llm,
title = "{LLM} Performance Predictors are good initializers for Architecture Search",
author = "Jawahar, Ganesh and
Abdul-Mageed, Muhammad and
Lakshmanan, Laks and
Ding, Dujian",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2024",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-acl.627",
doi = "10.18653/v1/2024.findings-acl.627",
pages = "10540--10560",
abstract = "In this work, we utilize Large Language Models (LLMs) for a novel use case: constructing Performance Predictors (PP) that estimate the performance of specific deep neural network architectures on downstream tasks. We create PP prompts for LLMs, comprising (i) role descriptions, (ii) instructions for the LLM, (iii) hyperparameter definitions, and (iv) demonstrations presenting sample architectures with efficiency metrics and {`}training from scratch{'} performance. In machine translation (MT) tasks, GPT-4 with our PP prompts (LLM-PP) achieves a SoTA mean absolute error and a slight degradation in rank correlation coefficient compared to baseline predictors. Additionally, we demonstrate that predictions from LLM-PP can be distilled to a compact regression model (LLM-Distill-PP), which surprisingly retains much of the performance of LLM-PP. This presents a cost-effective alternative for resource-intensive performance estimation. Specifically, for Neural Architecture Search (NAS), we introduce a Hybrid-Search algorithm (HS-NAS) employing LLM-Distill-PP for the initial search stages and reverting to the baseline predictor later. HS-NAS performs similarly to SoTA NAS, reducing search hours by approximately 50{\%}, and in some cases, improving latency, GFLOPs, and model size. The code can be found at: https://github.com/UBC-NLP/llmas.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="jawahar-etal-2024-llm">
<titleInfo>
<title>LLM Performance Predictors are good initializers for Architecture Search</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ganesh</namePart>
<namePart type="family">Jawahar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Muhammad</namePart>
<namePart type="family">Abdul-Mageed</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Laks</namePart>
<namePart type="family">Lakshmanan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dujian</namePart>
<namePart type="family">Ding</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this work, we utilize Large Language Models (LLMs) for a novel use case: constructing Performance Predictors (PP) that estimate the performance of specific deep neural network architectures on downstream tasks. We create PP prompts for LLMs, comprising (i) role descriptions, (ii) instructions for the LLM, (iii) hyperparameter definitions, and (iv) demonstrations presenting sample architectures with efficiency metrics and ‘training from scratch’ performance. In machine translation (MT) tasks, GPT-4 with our PP prompts (LLM-PP) achieves a SoTA mean absolute error and a slight degradation in rank correlation coefficient compared to baseline predictors. Additionally, we demonstrate that predictions from LLM-PP can be distilled to a compact regression model (LLM-Distill-PP), which surprisingly retains much of the performance of LLM-PP. This presents a cost-effective alternative for resource-intensive performance estimation. Specifically, for Neural Architecture Search (NAS), we introduce a Hybrid-Search algorithm (HS-NAS) employing LLM-Distill-PP for the initial search stages and reverting to the baseline predictor later. HS-NAS performs similarly to SoTA NAS, reducing search hours by approximately 50%, and in some cases, improving latency, GFLOPs, and model size. The code can be found at: https://github.com/UBC-NLP/llmas.</abstract>
<identifier type="citekey">jawahar-etal-2024-llm</identifier>
<identifier type="doi">10.18653/v1/2024.findings-acl.627</identifier>
<location>
<url>https://aclanthology.org/2024.findings-acl.627</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>10540</start>
<end>10560</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T LLM Performance Predictors are good initializers for Architecture Search
%A Jawahar, Ganesh
%A Abdul-Mageed, Muhammad
%A Lakshmanan, Laks
%A Ding, Dujian
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Findings of the Association for Computational Linguistics: ACL 2024
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F jawahar-etal-2024-llm
%X In this work, we utilize Large Language Models (LLMs) for a novel use case: constructing Performance Predictors (PP) that estimate the performance of specific deep neural network architectures on downstream tasks. We create PP prompts for LLMs, comprising (i) role descriptions, (ii) instructions for the LLM, (iii) hyperparameter definitions, and (iv) demonstrations presenting sample architectures with efficiency metrics and ‘training from scratch’ performance. In machine translation (MT) tasks, GPT-4 with our PP prompts (LLM-PP) achieves a SoTA mean absolute error and a slight degradation in rank correlation coefficient compared to baseline predictors. Additionally, we demonstrate that predictions from LLM-PP can be distilled to a compact regression model (LLM-Distill-PP), which surprisingly retains much of the performance of LLM-PP. This presents a cost-effective alternative for resource-intensive performance estimation. Specifically, for Neural Architecture Search (NAS), we introduce a Hybrid-Search algorithm (HS-NAS) employing LLM-Distill-PP for the initial search stages and reverting to the baseline predictor later. HS-NAS performs similarly to SoTA NAS, reducing search hours by approximately 50%, and in some cases, improving latency, GFLOPs, and model size. The code can be found at: https://github.com/UBC-NLP/llmas.
%R 10.18653/v1/2024.findings-acl.627
%U https://aclanthology.org/2024.findings-acl.627
%U https://doi.org/10.18653/v1/2024.findings-acl.627
%P 10540-10560
Markdown (Informal)
[LLM Performance Predictors are good initializers for Architecture Search](https://aclanthology.org/2024.findings-acl.627) (Jawahar et al., Findings 2024)
ACL