{
  "title": "LAIasBench: An Agent-Centric Benchmark for Radiology Finding-to-Report Generation",
  "authors": [
    "Natan Paraiso Ribeiro",
    "Petrus Paraiso Ribeiro",
    "Francisco Akira",
    "Stephanie Alba Herrera",
    "Raquel Moreno"
  ],
  "corresponding_author_email": "natan@laudos.ai",
  "suggested_primary_category": "cs.CL",
  "suggested_cross_lists": [
    "cs.AI",
    "cs.LG"
  ],
  "license_to_choose_on_arxiv": "Author decision required before submission. Prefer the most restrictive arXiv option compatible with author goals; arXiv license choice is irrevocable. Repository source remains proprietary source-available under LICENSE.",
  "abstract": "Radiology report generation is an end-to-end text-generation system problem. The clinically relevant question is not whether a language model can write fluent prose, see an image, or make a de novo imaging diagnosis, but whether a complete reporting workflow can convert provided findings into a faithful, structured, safe radiology report. We introduce LAIasBench, a CLI-first benchmark and evaluation harness for radiology finding-to-report generation in Brazilian Portuguese, with cross-locale support for American English. LAIasBench evaluates executable reporting systems with locked tasks, frozen prediction artifacts, deterministic clinical safety gates, LLM-adjudicated report-quality scoring, and score-first reporting with strict PASS gates as an error metric rather than diagnostic accuracy. Product agents, custom agents, mini-agent scaffolds, and raw model baselines are separated into comparable tracks so external companies, laboratories, and independent teams can submit results without exposing proprietary implementation details. A private 40-case daily regression suite is sampled from a synthetic 65,812-report source workflow derived from approximately 400 extractive seed reports, while public reference suites provide reproducible external testing. The framework records suite hashes, run metadata, canary tokens, cost and latency signals, deterministic gate failures, judge scores, benchmark-card actionability, and integrity checks that recompute summaries before leaderboard publication. Completed 49-case mini-agent baseline rows are reported separately from the 49-case LAIas product-agent reference result to show score spread without collapsing product and wrapper tracks. Current labels are heuristically derived and privacy-filtered; the benchmark supports engineering regression and reproducible reference testing, not radiologist-adjudicated ground truth or clinical deployment validation claims.",
  "comments": "Preprint source package prepared from repository branch codex/real-laiasbench-daily-eval. The private daily suite is capped at 40 cases and should not be exposed in public pages.",
  "public_artifact_id": "LAIASBENCH-PUBLIC-2026-05-02-BF78-A309",
  "telemetry_note": "Public site telemetry records page-view and artifact-click events with hashed IP/user-agent fields. The artifact ID is intentionally public to help identify copied pages or redistributed packages.",
  "submission_blockers": [
    "Author must log in or register on arXiv.",
    "arXiv may require endorsement for first-time submission or a new category.",
    "Author must choose the final arXiv license and validate metadata before final submission."
  ],
  "public_artifacts": {
    "preprint_pdf": "https://laibench.laudos.ai/laibench-preprint.pdf",
    "arxiv_source_zip": "https://laibench.laudos.ai/arxiv-laiasbench-source.zip",
    "repository_license": "Proprietary source-available; see LICENSE"
  }
}
