My research focuses on making machine learning more useful, more interpretable, and able to learn from and interact with humans. This helps users sift through decades of documents; discover when individuals lie, reframe, or change the topic in a conversation; or compete against humans in games based in natural language.
@inproceedings{Sung:Gor:Fleisig:Mondal:Boyd-Graber-2025,
Title = {ADVSCORE: A Metric for the Evaluation and Creation of Adversarial Benchmarks},
Author = {Yoo Yeon Sung and Maharshi Gor and Eve Fleisig and Ishani Mondal and Jordan Lee Boyd-Graber},
Booktitle = {Annual Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics},
Location = {Albuquerque},
Year = {2025},
Url = {http://cs.umd.edu/~jbg//docs/2025_naacl_advscore.pdf},
}
This was one of ten papers selected as an Outstanding Paper at NAACL 2025.
Accessible Abstract: Adversarial datasets should validate AI robustness by presenting samples that humans handle well but models struggle with. However, as models advance, these datasets risk becoming obsolete. Assessing whether a dataset remains adversarial is challenging due to the absence of a standardized metric for adversarialness. To address this, we introduce AdvScore, a human-grounded evaluation metric that quantifies a dataset's adversarial nature by accounting for the differing abilities of models and humans while also identifying low-quality examples.
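To make the intuition concrete, here is a minimal Python sketch of "adversarialness" as a human-versus-model accuracy gap per question. This is only an illustration under that simplified assumption; the actual AdvScore additionally models respondent skill and question difficulty (via item response theory) and flags low-quality items.

# Hypothetical illustration, not the AdvScore formula from the paper.
from statistics import mean

def item_gap(human_correct: list[bool], model_correct: list[bool]) -> float:
    """Positive when humans answer this question correctly more often than models."""
    return mean(human_correct) - mean(model_correct)

def dataset_adversarialness(items: list[tuple[list[bool], list[bool]]]) -> float:
    """Aggregate the per-question gaps over an entire benchmark."""
    return mean(item_gap(humans, models) for humans, models in items)

# Example: humans get a question right 3/4 of the time, models only 1/4.
print(item_gap([True, True, True, False], [False, True, False, False]))  # 0.5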
@article{Gu:Wongkamjan:Kummerfeld:Peskoff:May:Boyd-Graber-2025,
Title = {Should I Trust You? Detecting Deception in Negotiations using Counterfactual RL},
Author = {Feng Gu and Wichayaporn Wongkamjan and Jonathan K. Kummerfeld and Denis Peskoff and Jonathan May and Jordan Boyd-Graber},
Journal = {Findings of the Association for Computational Linguistics},
Year = {2025},
Url = {http://cs.umd.edu/~jbg//docs/2025_acl_ctrld.pdf},
}
Accessible Abstract: When determining whether an offer sounds "too good to be true," it helps to consider what the person sending the message has to gain. When we provide this information to classifiers tasked with determining whether a message is deceptive in the online game of Diplomacy, it dramatically improves their ability to detect deception.
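A hedged sketch of the core idea, assuming a feature-based classifier: alongside the message itself, give the model an estimate of what the sender stands to gain if the recipient believes the message. The names below (Message, sender_gain) are illustrative placeholders, not the paper's counterfactual-RL pipeline.

# Illustrative only: append an "incentive" signal to the message features so a
# downstream deception classifier can weigh whether an offer is too good to be true.
from dataclasses import dataclass

@dataclass
class Message:
    text_features: list[float]  # e.g., an embedding of the message text
    sender_gain: float          # estimated benefit to the sender if the message is believed

def classifier_input(msg: Message) -> list[float]:
    return msg.text_features + [msg.sender_gain]

print(classifier_input(Message(text_features=[0.1, -0.3, 0.7], sender_gain=2.5)))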
@inproceedings{Balepur:Padmakumar:Yang:Feng:Rudinger:Boyd-Graber-2025,
Title = {Whose Boat Does it Float? Improving Personalization in Preference Tuning via Inferred User Personas},
Author = {Nishant Balepur and Vishakh Padmakumar and Fumeng Yang and Shi Feng and Rachel Rudinger and Jordan Lee Boyd-Graber},
Booktitle = {Association for Computational Linguistics},
Location = {Vienna, Austria},
Year = {2025},
Url = {http://cs.umd.edu/~jbg//docs/2025_acl_boat.pdf},
}
Accessible Abstract: Language models are optimized to learn which responses you prefer, but they don't learn why you preferred a particular response. This limits their ability to handle personalized requests (e.g., "What should I eat for dinner? I'm vegetarian"), so we introduce a simple fix: have models infer personas that explain why users could prefer certain responses. We show that training on these inferred personas leads to responses that are significantly more personalized to user needs.
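As a rough sketch of the data-augmentation step (the function names and canned persona here are assumptions, not the paper's pipeline): infer a persona that explains a preference pair, then prepend it to the prompt before preference tuning.

# Hypothetical sketch: augment each preference example with an inferred persona.
def infer_persona(prompt: str, chosen: str, rejected: str) -> str:
    # In practice this would query an LLM; a canned answer keeps the example runnable.
    return "The user is a vegetarian who wants quick dinner ideas."

def personalize(example: dict) -> dict:
    persona = infer_persona(example["prompt"], example["chosen"], example["rejected"])
    return {**example, "prompt": f"{persona}\n{example['prompt']}"}

example = {"prompt": "What should I eat for dinner?",
           "chosen": "A chickpea curry.",
           "rejected": "A ribeye steak."}
print(personalize(example)["prompt"])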
@inproceedings{Balepur:Rudinger:Boyd-Graber-2025,
Title = {Which of These Best Describes Multiple Choice Evaluation with LLMs? A) Forced B) Flawed C) Fixable D) All of the Above},
Author = {Nishant Balepur and Rachel Rudinger and Jordan Boyd-Graber},
Booktitle = {Association for Computational Linguistics},
Location = {Vienna, Austria},
Year = {2025},
Url = {http://cs.umd.edu/~jbg//docs/2025_acl_mcqa_bad.pdf},
}
Accessible Abstract: Most people dislike taking multiple-choice tests, so why are they the default way we evaluate NLP systems? This position paper argues that, despite its simplicity and popularity, multiple-choice evaluation is flawed, both in its format and the datasets it relies on. Drawing from educational testing theory, we propose practical fixes for these issues, helping us build evaluations that better test knowledge and reflect how humans use NLP systems.
@inproceedings{Hoyle:Calvo-Bartolom\'e:Boyd-Graber:Resnik-2025,
Title = {ProxAnn: Use-Oriented Evaluations of Topic Models and Document Clustering},
Author = {Alexander Hoyle and Lorena Calvo-Bartolom\'e and Jordan Lee Boyd-Graber and Philip Resnik},
Booktitle = {Association for Computational Linguistics},
Location = {Vienna, Austria},
Year = {2025},
Url = {http://cs.umd.edu/~jbg//docs/2025_acl_proxann.pdf},
}
Accessible Abstract: Topic models are tools to help people navigate large document collections. However, testing whether a topic model is good is notoriously hard: it's subjective and requires asking real people whether the outputs make sense. We show that asking a language model to recreate the answers of humans correlates better with ground truth than previous evaluations.
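The evaluation recipe can be sketched as follows (a simplified assumption, not ProxAnn's actual protocol or prompts): have an LLM play the role of the human annotator, then check how well its judgments correlate with real human judgments.

# Illustrative proxy-annotator loop; llm_rating is a stub so the example runs without an API.
from statistics import correlation  # Pearson correlation, Python 3.10+

def llm_rating(topic_words: list[str], document: str) -> int:
    # Would normally prompt an LLM for a 1-5 "does this topic fit this document?" score.
    return min(5, 1 + len(document) % 5)

docs = ["budget vote", "tax reform bill", "soccer final", "league standings", "city budget hearing"]
human_scores = [5, 5, 1, 1, 4]
proxy_scores = [llm_rating(["budget", "tax", "vote"], d) for d in docs]
print(correlation(human_scores, proxy_scores))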
@inproceedings{Srikanth:Rudinger:Boyd-Graber-2025,
Author = {Neha Srikanth and Rachel Rudinger and Jordan Lee Boyd-Graber},
Title = {No Questions are Stupid but Some are Poorly Posed: Understanding Poorly-Posed Information-Seeking Questions},
Booktitle = {Association for Computational Linguistics},
Location = {Vienna, Austria},
Year = {2025},
Url = {http://cs.umd.edu/~jbg//docs/2025_acl_badq.pdf},
}
Accessible Abstract: Often, the questions users ask search engines or chatbots aren't perfect: they have errors, are vague, or lack context. Humans are able to deftly navigate these issues, but computers still struggle. We analyze the differences in how humans and computers repair imperfect questions to suggest how to improve AI's question answering abilities.
@inproceedings{Li:Calvo-Bartolom\'e:Hoyle:Xu:Stephens:Fung:Dima:Boyd-Graber-2025,
Author = {Zongxia Li and Lorena Calvo-Bartolom\'e and Alexander Miserlis Hoyle and Paiheng Xu and Daniel Kofi Stephens and Juan Francisco Fung and Alden Dima and Jordan Boyd-Graber},
Title = {LLMs Struggle to Describe the Haystack without Human Help: A Social Science-Inspired Evaluation of Topic Models},
Booktitle = {Association for Computational Linguistics},
Location = {Vienna, Austria},
Year = {2025},
Url = {http://cs.umd.edu/~jbg//docs/2025_acl_bass.pdf},
}
@inproceedings{Sung:Fleisig:Hope:Upadhyay:Boyd-Graber-2025,
Title = {GRACE: A Granular Benchmark for Evaluating Model Calibration against Human Calibration},
Author = {Yoo Yeon Sung and Eve Fleisig and Yu Hope and Ishan Upadhyay and Jordan Boyd-Graber},
Booktitle = {Association for Computational Linguistics},
Year = {2025},
Location = {Vienna, Austria},
Url = {http://cs.umd.edu/~jbg//docs/2025_acl_grace.pdf},
}
Accessible Abstract: As AI use becomes more common, it's important to measure not just whether the systems are correct but whether they know when they're incorrect. We propose a new metric to measure this mismatch between correctness and confidence, compare computer ability with human ability, and show that computers have a long way to go before they're well-calibrated.
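For readers who want the flavor of a confidence-versus-correctness metric, here is a standard expected calibration error computation in Python; GRACE itself is more granular and is grounded against human calibration, so treat this only as background.

# Generic expected calibration error (not GRACE): bucket predictions by stated
# confidence and compare each bucket's average confidence to its accuracy.
def expected_calibration_error(confidences, correct, n_bins=10):
    bins = [[] for _ in range(n_bins)]
    for conf, ok in zip(confidences, correct):
        idx = min(int(conf * n_bins), n_bins - 1)
        bins[idx].append((conf, ok))
    total = len(confidences)
    ece = 0.0
    for bucket in bins:
        if not bucket:
            continue
        avg_conf = sum(c for c, _ in bucket) / len(bucket)
        accuracy = sum(ok for _, ok in bucket) / len(bucket)
        ece += (len(bucket) / total) * abs(avg_conf - accuracy)
    return ece

print(expected_calibration_error([0.9, 0.8, 0.6, 0.3], [True, False, True, False]))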
@inproceedings{Balepur:Gu:Ravichander:Feng:Boyd-Graber:Rudinger-2025,
Title = {Reverse Question Answering: Can an LLM Write a Question so Hard (or Bad) that it Can't Answer?},
Author = {Nishant Balepur and Feng Gu and Abhilasha Ravichander and Shi Feng and Jordan Boyd-Graber and Rachel Rudinger},
Booktitle = {Annual Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics},
Year = {2025},
Location = {Albuquerque},
Url = {http://cs.umd.edu/~jbg//docs/2025_naacl_reverseqa.pdf},
}
Accessible Abstract: Language models like ChatGPT are pretty good at answering questions (e.g. "What is 12 * 12?"), but we show they can surprisingly struggle when asked to do the reverse task: generating questions for answers (e.g. "Give me a question with the answer 144"). We study when these errors happen, what might be causing them, and how they can be addressed.
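One way to probe this failure mode is a round-trip check: ask a model to write a question for a target answer, then see whether it can answer its own question. The ask_llm function below is a placeholder for whatever chat API you use; this is a testing sketch, not the paper's methodology.

# Hypothetical round-trip consistency check for reverse question answering.
def ask_llm(prompt: str) -> str:
    raise NotImplementedError("plug in your preferred LLM client here")

def round_trip_consistent(target_answer: str) -> bool:
    question = ask_llm(f"Write a question whose answer is: {target_answer}")
    recovered = ask_llm(question)
    return recovered.strip().lower() == target_answer.strip().lower()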
@inproceedings{Gu:Wongkamjan:Kummerfeld:Peskoff:May:Boyd-Graber-2025,
Title = {Personalized Help for Optimizing Low-Skilled Users' Strategy},
Author = {Feng Gu and Wichayaporn Wongkamjan and Jonathan K. Kummerfeld and Denis Peskoff and Jonathan May and Jordan Boyd-Graber},
Booktitle = {Annual Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics},
Year = {2025},
Location = {Albuquerque},
Url = {http://cs.umd.edu/~jbg//docs/2024_arr_chiron-advisor.pdf},
}
Accessible Abstract: AIs can beat humans in game environments, but how helpful those agents are to humans remains understudied. We augment CICERO, a natural language agent that demonstrates superhuman performance in Diplomacy, to generate both move and message advice based on player intentions. A dozen Diplomacy games with novice and experienced players, under varying advice settings, show that some of the generated advice is beneficial: it helps novices compete with experienced players and, in some instances, even surpass them. The mere presence of advice can be advantageous, even if players do not follow it.
@inproceedings{Balepur:Siu:Lipka:Dernoncourt:Sun:Boyd-Graber:Mathur-2025,
Title = {MoDS: Moderating a Mixture of Document Speakers to Summarize Debatable Queries in Document Collections},
Author = {Nishant Balepur and Alexa Siu and Nedim Lipka and Franck Dernoncourt and Tong Sun and Jordan Lee Boyd-Graber and Puneet Mathur},
Booktitle = {Annual Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics},
Year = {2025},
Location = {Albuquerque},
Url = {http://cs.umd.edu/~jbg//docs/2025_naacl_mods.pdf},
}
Accessible Abstract: When you ask ChatGPT for advice on questions with multiple perspectives (e.g. "Is pineapple good on pizza?"), you likely want a response that fairly represents all viewpoints. We formulate this task, collect a dataset to test it, and develop MoDS, a system where multiple ChatGPTs debate like a panel discussion, to generate balanced answers to questions based on multiple sources.
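A minimal sketch of the panel-discussion idea, assuming a generic ask_llm placeholder rather than the actual MoDS implementation: each source document gets its own speaker, the speakers take turns responding to the query, and a moderator writes a summary that must represent every viewpoint.

# Illustrative moderator/speaker loop; prompts and structure are simplified assumptions.
def ask_llm(prompt: str) -> str:
    raise NotImplementedError("plug in an LLM client")

def panel_summary(query: str, documents: list[str], rounds: int = 2) -> str:
    transcript = []
    for _ in range(rounds):
        for i, doc in enumerate(documents):
            turn = ask_llm(f"You speak only for source {i}:\n{doc}\n"
                           f"Question: {query}\nDiscussion so far: {transcript}")
            transcript.append(f"Speaker {i}: {turn}")
    return ask_llm("As moderator, summarize the discussion so every viewpoint on "
                   f"'{query}' is fairly represented:\n" + "\n".join(transcript))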
@article{B\"orschinger:Boyd-Graber:Buck:Bulian:Ciaramita:Huebscher:Gajewski:Kilcher:Nogueira:Saralegu-2020,
Title = {Meta Answering for Machine Reading},
Author = {Benjamin B\"orschinger and Jordan Boyd-Graber and Christian Buck and Jannis Bulian and Massimiliano Ciaramita and Michelle Chen Huebscher and Wojciech Gajewski and Yannic Kilcher and Rodrigo Nogueira and Lierni Sestorain Saralegu},
Journal = {ArXiv},
Year = {2020},
Url = {https://arxiv.org/abs/1911.04156},
}
@article{Rodriguez:Feng:Iyyer:He:Boyd-Graber-2020,
Title = {Quizbowl: The Case for Incremental Question Answering},
Author = {Pedro Rodriguez and Shi Feng and Mohit Iyyer and He He and Jordan Boyd-Graber},
Journal = {ArXiv},
Year = {2020},
Url = {https://arxiv.org/abs/1904.04792},
}