{
  "synthetic": true,
  "description": "Ground-truth key fields for the workshop OCR companion docs. Each document has 5 tier variants: clean (baseline) and tier1..tier4 (increasing degradation). Participants attempt to extract these fields from the tier N PDFs; accuracy is scored against this file.",
  "documents": [
    {
      "doc_id": "nsf_award_OAC-2415678",
      "doc_type": "NSF Award Notice",
      "sponsor": "NSF",
      "pages": 4,
      "clean_pdf": "workshop_ocr/clean/nsf_award_notice_OAC-2415678.pdf",
      "tiered_pdfs": {
        "tier1": "workshop_ocr/tier1_structural/nsf_award_notice_OAC-2415678.pdf",
        "tier2": "workshop_ocr/tier2_scan/nsf_award_notice_OAC-2415678.pdf",
        "tier3": "workshop_ocr/tier3_authentic/nsf_award_notice_OAC-2415678.pdf",
        "tier4": "workshop_ocr/tier4_layout/nsf_award_notice_OAC-2415678.pdf"
      },
      "ground_truth_fields": {
        "award_number_fain": "OAC-2415678",
        "amendment_number": "000",
        "award_instrument": "Continuing Grant",
        "award_date": "2024-08-05",
        "period_of_performance_start": "2024-09-01",
        "period_of_performance_end": "2028-08-31",
        "project_title": "Trustworthy campus-scale assistants for research lifecycle intelligence",
        "funding_opportunity": "NSF 24-512 Cyberinfrastructure for Sustained Scientific Innovation (CSSI)",
        "assistance_listing": "47.070",
        "amount_obligated_this_amendment": 462500.00,
        "total_intended_award_amount": 1850000.00,
        "cost_share_amount": 0.00,
        "principal_investigator": "Malik T. Ashford",
        "pi_email": "mashford@synthetic.uidaho.edu",
        "pi_organization": "UNIVERSITY OF IDAHO",
        "recipient_legal_name": "REGENTS OF THE UNIVERSITY OF IDAHO (SYNTHETIC)",
        "recipient_uei": "SYN7QWYKRJH5",
        "recipient_address": "875 PERIMETER DR MOSCOW, ID 83844-9803",
        "managing_program_officer": "Dr. Amelia K. Voss",
        "managing_program_officer_email": "avoss@synthetic.nsf.gov",
        "managing_grants_official": "Priya R. Lakshmi",
        "awarding_official": "Jannele L. Gosey",
        "indirect_cost_rate_pct": 50.0,
        "indirect_cost_base": "Modified Total Direct Costs",
        "budget": {
          "total_salaries_and_wages": 1028000.00,
          "fringe_benefits": 308400.00,
          "equipment": 0.00,
          "domestic_travel": 49333.00,
          "international_travel": 0.00,
          "materials_supplies": 61667.00,
          "publication_costs": 8000.00,
          "computer_services": 24667.00,
          "total_direct_costs": 1233334.00,
          "indirect_costs": 616666.00,
          "total_direct_and_indirect": 1850000.00
        }
      }
    },
    {
      "doc_id": "subaward_SUB-24-001",
      "doc_type": "Subaward Agreement",
      "sponsor_upstream": "NIH (via University of Idaho prime recipient)",
      "pages": 3,
      "clean_pdf": "workshop_ocr/clean/subaward_agreement_SUB-24-001.pdf",
      "tiered_pdfs": {
        "tier1": "workshop_ocr/tier1_structural/subaward_agreement_SUB-24-001.pdf",
        "tier2": "workshop_ocr/tier2_scan/subaward_agreement_SUB-24-001.pdf",
        "tier3": "workshop_ocr/tier3_authentic/subaward_agreement_SUB-24-001.pdf",
        "tier4": "workshop_ocr/tier4_layout/subaward_agreement_SUB-24-001.pdf"
      },
      "ground_truth_fields": {
        "subaward_number": "SUB-24-001",
        "prime_award_number": "R01-AI-24-8002",
        "prime_sponsor": "NIH",
        "subaward_type": "Cost-Reimbursement Subaward",
        "executed_date": "2024-08-15",
        "period_of_performance_start": "2024-09-01",
        "period_of_performance_end": "2028-08-31",
        "total_obligated_amount": 185000.00,
        "f_and_a_rate_pct": 26.0,
        "f_and_a_type": "off_campus",
        "f_and_a_base": "MTDC",
        "prime_cfda": "93.855",
        "pass_through_entity": "Regents of the University of Idaho (Synthetic)",
        "pass_through_uei": "SYN7QWYKRJH5",
        "prime_pi": "R.J. MacReady, Ph.D.",
        "subrecipient_organization": "Outpost 31 Arctic Research Station",
        "subrecipient_uei": "O31ARST0031X",
        "subrecipient_ein": "99-0310310",
        "subrecipient_cage": "O31RS",
        "subrecipient_duns": "031031031031",
        "subrecipient_authorized_official": "Gary Bennings, Station Director",
        "subrecipient_pi": "Blair A. Copper",
        "subrecipient_pi_email": "bcopper@outpost31.org",
        "budget_periods": [
          {"period": 1, "start": "2024-09-01", "end": "2025-08-31", "amount": 46250.00},
          {"period": 2, "start": "2025-09-01", "end": "2026-08-31", "amount": 46250.00},
          {"period": 3, "start": "2026-09-01", "end": "2027-08-31", "amount": 46250.00},
          {"period": 4, "start": "2027-09-01", "end": "2028-08-31", "amount": 46250.00}
        ],
        "overall_risk_rating": "Medium",
        "risk_factor_geographic": "High",
        "risk_assessed_by": "Palmer, Nauls K. (Post-Award Financial Analyst)",
        "risk_assessment_date": "2024-07-20",
        "invoices": [
          {"invoice_number": "O31-INV-2025-001", "amount": 28750.00, "status": "Paid"},
          {"invoice_number": "O31-INV-2025-002", "amount": 17500.00, "status": "Under Review"}
        ]
      }
    },
    {
      "doc_id": "nih_noa_1R01AI248002-01",
      "doc_type": "NIH Notice of Award",
      "sponsor": "NIH/NIAID",
      "pages": 3,
      "clean_pdf": "workshop_ocr/clean/nih_notice_of_award_1R01AI248002-01.pdf",
      "tiered_pdfs": {
        "tier1": "workshop_ocr/tier1_structural/nih_notice_of_award_1R01AI248002-01.pdf",
        "tier2": "workshop_ocr/tier2_scan/nih_notice_of_award_1R01AI248002-01.pdf",
        "tier3": "workshop_ocr/tier3_authentic/nih_notice_of_award_1R01AI248002-01.pdf",
        "tier4": "workshop_ocr/tier4_layout/nih_notice_of_award_1R01AI248002-01.pdf"
      },
      "ground_truth_fields": {
        "sponsor_award_number": "1R01AI248002-01",
        "fain": "1R01AI248002",
        "federal_award_date": "2024-09-01",
        "award_type": "New",
        "action_type": "New Competing Award",
        "budget_period_start": "2024-09-01",
        "budget_period_end": "2025-08-31",
        "project_period_start": "2024-09-01",
        "project_period_end": "2028-08-31",
        "award_title": "Molecular mechanisms of cellular identity subversion in Thingium assimilans",
        "nih_activity_code": "R01",
        "funding_opportunity": "PAR-23-218",
        "assistance_listing_cfda": "93.855",
        "assistance_listing_name": "Allergy, Immunology, and Transplantation Research",
        "recipient_name": "REGENTS OF THE UNIVERSITY OF IDAHO (SYNTHETIC)",
        "recipient_address": "875 PERIMETER DR, MOSCOW, ID 83844-9803",
        "recipient_uei": "SYN7QWYKRJH5",
        "recipient_ein": "82-0290662",
        "congressional_district": "02",
        "pi_name": "R.J. MACREADY, PhD",
        "pi_email": "rmacready@synthetic.uidaho.edu",
        "authorized_official": "PEREGRIN A. TOOKFIELD",
        "grants_management_specialist": "SAMANTHA P. HENEGAR",
        "grants_management_email": "shenegar@synthetic.niaid.nih.gov",
        "program_official": "DR. NICOLAS J. DEMBOWSKI",
        "program_official_email": "ndembowski@synthetic.niaid.nih.gov",
        "year1_budget": {
          "salaries_and_wages": 245000.00,
          "fringe_benefits": 73500.00,
          "consultant_services": 0.00,
          "equipment": 0.00,
          "supplies": 28000.00,
          "travel": 14500.00,
          "other_expenses": 11750.00,
          "consortium_contractual_direct": 27250.00,
          "total_direct_costs": 400000.00,
          "indirect_costs_fa_50pct_mtdc": 200000.00,
          "total_approved_budget": 600000.00,
          "federal_share": 600000.00,
          "non_federal_share": 0.00
        },
        "future_year_commitments": [
          {"year": 2, "direct": 400000.00, "indirect": 200000.00, "total": 600000.00},
          {"year": 3, "direct": 400000.00, "indirect": 200000.00, "total": 600000.00},
          {"year": 4, "direct": 400000.00, "indirect": 200000.00, "total": 600000.00}
        ],
        "priority_score": 18,
        "percentile": 7,
        "clinical_trial_indicator": "No",
        "human_subjects_code": "10",
        "iacuc_approval_id": "UI-IACUC-2024-037",
        "approved_subaward_amount": 185000.00,
        "approved_subaward_id": "SUB-24-001"
      }
    }
  ],
  "degradation_tiers": {
    "clean": "Ground-truth PDF as generated. Full text layer. Naive pdftotext extraction recovers 100%.",
    "tier1_structural": "Image-only PDF (no text layer), 150 DPI, light skew, JPEG Q=55. Minimum change that defeats naive pdftotext — now requires actual OCR. All visual content is still crisp.",
    "tier2_scan": "Image-only. Multi-page bleed-through (mirrored faint text from adjacent page), Gaussian noise, salt-and-pepper speckle, mild blur, aggressive contrast reduction, 1-bit Floyd-Steinberg dithering (fax/photocopy aesthetic), horizontal banding streaks, 40-quality JPEG re-compression. Targets tesseract and generic OCR engines.",
    "tier3_authentic": "Builds on Tier 2. Adds red 'RECEIVED' stamp overlapping header text, yellow highlighter over budget figures (with semi-transparent multiply blend), handwritten signatures and margin annotations (Bradley Hand font + path-drawn squiggles), three-hole-punch cutouts on the left edge, coffee ring stain. Simulates the post-scan, post-review state that sponsor-generated PDFs typically arrive in.",
    "tier4_layout": "Image-only, Tier 1-level degradation + one page rotated 90 degrees + budget table replaced with a hand-drawn grid-line image in which ~45% of horizontal row separators are randomly omitted (creating a 'merged cell' ambiguity that defeats table-structure extraction)."
  }
}
