{"results":[{"id":"confidence-unreliable","text":"LLM self-assessed confidence does not track accuracy. Confirmed across 4 models: Sonnet r=0.198, Opus r=-0.182 (worse than random), Flash r=0.219, Pro r=0.121. Answer and confidence come from the same process — same structural flaw as human overconfidence (Kahneman)","truth_value":"IN","justification_count":0,"dependent_count":2,"challenges":[],"last_reviewed":null,"review_result":null},{"id":"eem-replaces-confidence","text":"EEM replaces 'am I sure?' with 'is this justified?' — shifting from unreliable confidence to auditable justification chains","truth_value":"IN","justification_count":1,"dependent_count":0,"challenges":[],"last_reviewed":null,"review_result":null}],"count":2,"limit":20,"offset":0}