
Project supported by NSF Award 2229885
Project site: trails.umd.edu
TRAILS brings together research on how people and AI systems work together when tasks involve language, information access, and decision making. This page is the local hub for our group's TRAILS-related contributions, including collaborative question answering, speech translation, and evaluations of when AI systems help or mislead people.
For the broader project overview, resources, and collaborators beyond this page, see the main TRAILS website.
<< back to top
@inproceedings{Han:Balepur:Boyd-Graber:Carpuat-2026,
Title = {Measuring User's Mental Models of Speech Translation in Human-{MT} Collaboration},
Author = {HyoJung Han and Nishant Balepur and Jordan Lee Boyd-Graber and Marine Carpuat},
Booktitle = {Association for Computational Linguistics},
Year = {2026},
}
@article{Gor:Sung:Hou:Fleisig:Ying:Zhou:Boyd-Graber-2026,
Title = {{AI}, Take the Wheel: What Drives Delegation and Trust in Human-Computer Cooperative Question Answering?},
Author = {Maharshi Gor and Yoo Yeon Sung and Yu Hou and Eve Fleisig and Zhu Irene Ying and Tianyi Zhou and Jordan Lee Boyd-Graber},
Journal = {Findings of the Association for Computational Linguistics},
Year = {2026},
}
@article{Kabir:Kurdydyk:Palnitkar:Dorn:Ahmed:Boyd-Graber-2026,
Title = {{AUDITA}: A New Dataset to Audit Humans or {AI} is Better at Audio {QA}},
Author = {Tasnim Kabir and Dmytro Kurdydyk and Aadi Palnitkar and Liam Dorn and Ahmed Haj Ahmed and Jordan Lee Boyd-Graber},
Journal = {Findings of the Association for Computational Linguistics},
Year = {2026},
}
@inproceedings{Li:Calvo-Bartolome:Hoyle:Xu:Stephens:Fung:Dima:Boyd-Graber-2025,
Author = {Zongxia Li and Lorena Calvo-Bartolom{\'e} and Alexander Miserlis Hoyle and Paiheng Xu and Daniel Kofi Stephens and Juan Francisco Fung and Alden Dima and Jordan Boyd-Graber},
Title = {{LLMs} Struggle to Describe the Haystack without Human Help: A Social Science-Inspired Evaluation of Topic Models},
Booktitle = {Association for Computational Linguistics},
Location = {Vienna, Austria},
Year = {2025},
Url = {http://cs.umd.edu/~jbg//docs/2025_acl_bass.pdf},
}
Accessible Abstract: Understanding large document collections has been the domain of old-fashioned models called topic models for decades. There are new models based on LLMs that claim to be better... are they? We propose a new evaluation based on how much people learn from interacting with models to categorize a dataset, comparing traditional and LLM-based models: traditional models are not bad, new models hallucinate, and a human-in-the-loop model that we call BASS has the best outcomes.
@inproceedings{Hoyle:Calvo-Bartolome:Boyd-Graber:Resnik-2025,
Title = {{ProxAnn}: Use-Oriented Evaluations of Topic Models and Document Clustering},
Author = {Alexander Hoyle and Lorena Calvo-Bartolom{\'e} and Jordan Lee Boyd-Graber and Philip Resnik},
Booktitle = {Association for Computational Linguistics},
Location = {Vienna, Austria},
Year = {2025},
Url = {http://cs.umd.edu/~jbg//docs/2025_acl_proxann.pdf},
}
Accessible Abstract: Topic models are tools to help people navigate large document collections. However, testing whether a topic model is good or not is notoriously hard, as it's subjective and requires asking real people about whether the outputs make sense. We show that you can ask a language model to recreate the answers of humans, correlating better with ground truth than previous evaluations.
Any opinions, findings, and conclusions or recommendations expressed in this material are those of the researchers and do not necessarily reflect the views of the sponsor.