{"id":"https://openalex.org/W4398192305","doi":"https://doi.org/10.48550/arxiv.2405.11751","title":"Asymptotic theory of in-context learning by linear attention","display_name":"Asymptotic theory of in-context learning by linear attention","publication_year":2024,"publication_date":"2024-05-19","ids":{"openalex":"https://openalex.org/W4398192305","doi":"https://doi.org/10.48550/arxiv.2405.11751"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2405.11751","pdf_url":"https://arxiv.org/pdf/2405.11751","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2405.11751","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5038111389","display_name":"Yue M. Lu","orcid":"https://orcid.org/0000-0002-5174-2595"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu, Yue M.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5098749021","display_name":"Mary I. Letey","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Letey, Mary I.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5063822685","display_name":"Jacob A. Zavatone-Veth","orcid":"https://orcid.org/0000-0002-4060-1738"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zavatone-Veth, Jacob A.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007951093","display_name":"Anindita Maiti","orcid":"https://orcid.org/0000-0002-4712-6626"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Maiti, Anindita","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5023195984","display_name":"Cengiz Pehlevan","orcid":"https://orcid.org/0000-0001-9767-6063"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pehlevan, Cengiz","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":83},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10320","display_name":"Neural Networks and Applications","score":0.9431,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10320","display_name":"Neural Networks and Applications","score":0.9431,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/asymptotic-analysis","display_name":"Asymptotic Analysis","score":0.50469816}],"concepts":[{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.6078732},{"id":"https://openalex.org/C119047807","wikidata":"https://www.wikidata.org/wiki/Q752718","display_name":"Asymptotic analysis","level":2,"score":0.50469816},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.42010418},{"id":"https://openalex.org/C149782125","wikidata":"https://www.wikidata.org/wiki/Q160039","display_name":"Econometrics","level":1,"score":0.41306692},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.37458292},{"id":"https://openalex.org/C28826006","wikidata":"https://www.wikidata.org/wiki/Q33521","display_name":"Applied mathematics","level":1,"score":0.33668858},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.325876},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.14394283},{"id":"https://openalex.org/C166957645","wikidata":"https://www.wikidata.org/wiki/Q23498","display_name":"Archaeology","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2405.11751","pdf_url":"https://arxiv.org/pdf/2405.11751","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2405.11751","pdf_url":"https://arxiv.org/pdf/2405.11751","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W3125884652","https://openalex.org/W3123593911","https://openalex.org/W3121988082","https://openalex.org/W3021432157","https://openalex.org/W2405030091","https://openalex.org/W2129428590","https://openalex.org/W1995897514","https://openalex.org/W1979597421","https://openalex.org/W177160858"],"abstract_inverted_index":{"Transformers":[0],"have":[1],"a":[2,35,61,75,91,126,136],"remarkable":[3],"ability":[4],"to":[5,64,101],"learn":[6],"and":[7,50,106,116,134,145,174,194],"execute":[8],"tasks":[9],"based":[10],"on":[11],"examples":[12,121],"provided":[13],"within":[14],"the":[15,43,87,96,103,113,117,140,151,155,165,177],"input":[16],"itself,":[17],"without":[18],"explicit":[19],"prior":[20],"training.":[21],"It":[22],"has":[23],"been":[24],"argued":[25],"that":[26],"this":[27],"capability,":[28],"known":[29],"as":[30],"in-context":[31,172],"learning":[32,88,128,173],"(ICL),":[33],"is":[34,99],"cornerstone":[36],"of":[37,72,74,119,160,179],"Transformers'":[38],"success,":[39],"yet":[40],"questions":[41,66],"about":[42],"necessary":[44],"sample":[45],"complexity,":[46],"pretraining":[47,107,120,132],"task":[48,78,108,147],"diversity,":[49],"context":[51,104],"length":[52,105],"for":[53,86],"successful":[54],"ICL":[55,73],"remain":[56],"unresolved.":[57],"Here,":[58],"we":[59],"provide":[60],"precise":[62],"answer":[63],"these":[65],"in":[67,90,139,164],"an":[68],"exactly":[69],"solvable":[70],"model":[71,156],"linear":[76,80,192],"regression":[77],"by":[79],"attention.":[81],"We":[82,124],"derive":[83],"sharp":[84],"asymptotics":[85],"curve":[89,129],"phenomenologically-rich":[92],"scaling":[93],"regime":[94],"where":[95],"token":[97,114],"dimension":[98],"taken":[100],"infinity;":[102],"diversity":[109,148,153,167],"scale":[110],"proportionally":[111],"with":[112,130,190],"dimension;":[115],"number":[118],"scales":[122],"quadratically.":[123],"demonstrate":[125],"double-descent":[127],"increasing":[131],"examples,":[133],"uncover":[135],"phase":[137],"transition":[138],"model's":[141],"behavior":[142],"between":[143],"low":[144,152],"high":[146,166],"regimes:":[149],"In":[150],"regime,":[154,168],"tends":[157],"toward":[158],"memorization":[159],"training":[161],"tasks,":[162],"whereas":[163],"it":[169],"achieves":[170],"genuine":[171],"generalization":[175],"beyond":[176],"scope":[178],"pretrained":[180],"tasks.":[181],"These":[182],"theoretical":[183],"insights":[184],"are":[185],"empirically":[186],"validated":[187],"through":[188],"experiments":[189],"both":[191],"attention":[193],"full":[195],"nonlinear":[196],"Transformer":[197],"architectures.":[198]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4398192305","counts_by_year":[],"updated_date":"2025-01-19T16:19:32.243934","created_date":"2024-05-22"}