p-Hacking: Garden of Forking Paths

drag Y to set number of sub-tests K (1 = honest, 20 = pure fishing) · click to burst-sample · [R] reset

Both panels are generated from data where the null hypothesis is **true** — two groups, each drawn from $\mathcal{N}(0, 1)$, with $n = 12$ per arm. A two-sided Welch t-test is then run. Under $H_{0}$ a p-value is itself $\mathrm{Uniform}(0, 1)$, so the **left** histogram of an honest analyst running one pre-specified test is flat, and the long-run false-positive rate sits at the nominal $\alpha = 0.05$. The **right** panel shows the same null world, but here the investigator runs $K$ separate sub-analyses on each study (different subgroup splits, exclusions, transformations) and reports the **minimum** p-value. The minimum of $K$ i.i.d. uniforms is $\mathrm{Beta}(1, K)$, so the reported-p distribution collapses toward zero, and the false-positive rate inflates to $1 - (1 - \alpha)^{K}$ — about $23\%$ at $K = 5$ and $\approx 64\%$ at $K = 20$. **Drag mouseY** to scrub $K$ from 1 (matches honest) to 20 (pure p-hacking). This is the multiple-comparisons illusion documented in Simmons, Nelson & Simonsohn (2011), “False-Positive Psychology” — with enough researcher degrees of freedom, **any** dataset can be made to show 'significant' effects. Bins where $p < \alpha$ are tinted yellow; the dashed line marks the uniform expectation $n_{\text{studies}} / 20$ per bin.

idle

224 lines · vanilla

view source

// p-Hacking sandbox — "I just keep resampling until significant"
//
// Two side-by-side histograms of p-values, both generated from TRULY-NULL data
// (no real effect anywhere). Left panel: an honest analyst who runs ONE pre-
// specified t-test per study. Their p-value distribution is uniform on [0,1],
// because under H0 a p-value is itself a Uniform(0,1) random variable.
//
// Right panel: a "fishing" investigator who runs K sub-analyses per study
// (different subgroup splits, transformations, exclusions) and reports the
// MINIMUM p-value. The min of K i.i.d. Uniform(0,1) is Beta(1, K), heavily
// skewed toward 0, so the fishing histogram piles up on the left.
//
// User scrubs mouseY to set K from 1 (matches honest) to 20 (false-positive
// rate near 1 - (1-0.05)^20 ≈ 64%).

// Canvas size in pixels; kept in sync by init() and tick().
let W = 0;
let H = 0;

// Simulation parameters.
const ALPHA = 0.05;              // significance threshold
const BINS = 20;                 // equal-width histogram bins on [0, 1] (0.05 each)
const N_PER_GROUP = 12;          // observations per arm in every t-test
const STUDIES_PER_FRAME = 6;     // studies appended to each histogram per frame

// Running tallies for the honest analyst (one test per study).
let honestHist = new Int32Array(BINS);
let honestTotal = 0;
let honestSig = 0;               // how many honest p-values fell below ALPHA

// Running tallies for the fishing investigator (min of K tests per study).
let fishHist = new Int32Array(BINS);
let fishTotal = 0;
let fishSig = 0;

// Number of sub-tests the fishing investigator runs per study.
let K = 5;

// Short-lived "p-value" dots animated above the panels for the newest studies.
let drops = [];

// Records the initial canvas size in the module-level W / H.
function init({ width, height }) {
  [W, H] = [width, height];
}

// --- statistics helpers ---

// Draws one standard-normal variate with the Box-Muller transform.
// Consumes two Math.random() values: the first (floored at 1e-12 so the
// logarithm stays finite) sets the radius, the second sets the angle.
function randn() {
  const radial = Math.max(Math.random(), 1e-12);
  const angle = 2 * Math.PI * Math.random();
  return Math.sqrt(-2 * Math.log(radial)) * Math.cos(angle);
}

// Natural log of the gamma function (Lanczos approximation, g = 7, 9 terms).
function logGamma(z) {
  const coeffs = [
    0.99999999999980993, 676.5203681218851, -1259.1392167224028,
    771.32342877765313, -176.61502916214059, 12.507343278686905,
    -0.13857109526572012, 9.9843695780195716e-6, 1.5056327351493116e-7,
  ];
  if (z < 0.5) {
    // Reflection formula keeps the series in its accurate range.
    return Math.log(Math.PI / Math.sin(Math.PI * z)) - logGamma(1 - z);
  }
  const zz = z - 1;
  let series = coeffs[0];
  for (let i = 1; i < coeffs.length; i++) series += coeffs[i] / (zz + i);
  const t = zz + 7.5;
  return 0.5 * Math.log(2 * Math.PI) + (zz + 0.5) * Math.log(t) - t + Math.log(series);
}

// Continued fraction for the incomplete beta function (modified Lentz method).
function betaContinuedFraction(x, a, b) {
  const MAX_ITER = 300;
  const EPS = 1e-13;
  const TINY = 1e-300; // stands in for 0 so we never divide by zero
  const qab = a + b, qap = a + 1, qam = a - 1;
  let c = 1;
  let d = 1 - (qab * x) / qap;
  if (Math.abs(d) < TINY) d = TINY;
  d = 1 / d;
  let h = d;
  for (let m = 1; m <= MAX_ITER; m++) {
    const m2 = 2 * m;
    // even step
    let aa = (m * (b - m) * x) / ((qam + m2) * (a + m2));
    d = 1 + aa * d; if (Math.abs(d) < TINY) d = TINY;
    c = 1 + aa / c; if (Math.abs(c) < TINY) c = TINY;
    d = 1 / d;
    h *= d * c;
    // odd step
    aa = (-(a + m) * (qab + m) * x) / ((a + m2) * (qap + m2));
    d = 1 + aa * d; if (Math.abs(d) < TINY) d = TINY;
    c = 1 + aa / c; if (Math.abs(c) < TINY) c = TINY;
    d = 1 / d;
    const delta = d * c;
    h *= delta;
    if (Math.abs(delta - 1) < EPS) break;
  }
  return h;
}

// Regularized incomplete beta function I_x(a, b) for a, b > 0.
function regularizedIncompleteBeta(x, a, b) {
  if (x <= 0) return 0;
  if (x >= 1) return 1;
  const front = Math.exp(
    logGamma(a + b) - logGamma(a) - logGamma(b) + a * Math.log(x) + b * Math.log(1 - x)
  );
  // Use whichever side of the symmetry relation converges quickly.
  if (x < (a + 1) / (a + b + 2)) {
    return (front * betaContinuedFraction(x, a, b)) / a;
  }
  return 1 - (front * betaContinuedFraction(1 - x, b, a)) / b;
}

// Welch two-sample t-test; returns the two-sided p-value.
//
// The p-value comes from the exact Student-t distribution with
// Welch-Satterthwaite degrees of freedom:
//     p = I_x(df / 2, 1 / 2),   x = df / (df + t^2),
// where I is the regularized incomplete beta function. A normal approximation
// is not good enough here: with n = 12 per arm (df around 22) the normal tail
// understates p, so about 6% of null p-values land below 0.05 instead of 5%,
// and the "honest" histogram would no longer be Uniform(0, 1).
//
// a, b: array-likes of numbers (the two samples).
// Returns a number in [0, 1]. Returns 1 (no evidence of a difference) when
// either sample has fewer than two values or the standard error is ~0.
function tTestTwoSidedP(a, b) {
  const na = a.length, nb = b.length;
  if (na < 2 || nb < 2) return 1; // sample variance is undefined
  let ma = 0, mb = 0;
  for (let i = 0; i < na; i++) ma += a[i];
  for (let i = 0; i < nb; i++) mb += b[i];
  ma /= na; mb /= nb;
  let va = 0, vb = 0;
  for (let i = 0; i < na; i++) { const d = a[i] - ma; va += d * d; }
  for (let i = 0; i < nb; i++) { const d = b[i] - mb; vb += d * d; }
  va /= (na - 1); vb /= (nb - 1);
  // Per-arm squared standard errors.
  const qa = va / na, qb = vb / nb;
  const se = Math.sqrt(qa + qb);
  if (se < 1e-15) return 1;
  const t = (ma - mb) / se;
  // Welch-Satterthwaite effective degrees of freedom.
  const df = ((qa + qb) * (qa + qb)) / ((qa * qa) / (na - 1) + (qb * qb) / (nb - 1));
  const p = regularizedIncompleteBeta(df / (df + t * t), 0.5 * df, 0.5);
  return Math.min(1, Math.max(0, p));
}

// Standard normal cumulative distribution function, expressed through erf:
// Phi(x) = (1 + erf(x / sqrt(2))) / 2.
function phi(x) {
  const scaled = x / Math.SQRT2;
  return (1 + erf(scaled)) / 2;
}
// Error function via the Abramowitz & Stegun 7.1.26 polynomial approximation.
// Evaluated on |x| and mirrored for negative arguments (erf is odd).
function erf(x) {
  const negative = x < 0;
  const ax = Math.abs(x);
  // Coefficients a5..a1, highest order first, for Horner evaluation.
  const horner = [1.061405429, -1.453152027, 1.421413741, -0.284496736, 0.254829592];
  const t = 1 / (1 + 0.3275911 * ax);
  let poly = 0;
  for (const coeff of horner) {
    poly = poly * t + coeff;
  }
  const magnitude = 1 - poly * t * Math.exp(-ax * ax);
  return negative ? -magnitude : magnitude;
}

// Returns a Float64Array of n independent randn() draws. Every arm in the
// simulation comes from this one sampler, so the null hypothesis is true.
function sampleNullGroup(n) {
  const sample = new Float64Array(n);
  let slot = 0;
  while (slot < n) {
    sample[slot] = randn();
    slot += 1;
  }
  return sample;
}

// One study by the honest analyst: draw two null arms of N_PER_GROUP values
// and return the p-value of the single planned t-test.
function honestStudy() {
  const firstArm = sampleNullGroup(N_PER_GROUP);
  const secondArm = sampleNullGroup(N_PER_GROUP);
  const pValue = tTestTwoSidedP(firstArm, secondArm);
  return pValue;
}

// One study by the fishing investigator: perform k t-tests, each on two
// freshly drawn null arms of N_PER_GROUP values, and return the smallest
// p-value seen. The sub-tests use independent data, i.e. this is the
// "k independent comparisons" model of multiple testing.
// Returns 1 when no test is run (k <= 0).
function fishStudy(k) {
  let smallest = 1;
  let done = 0;
  while (done < k) {
    const candidate = tTestTwoSidedP(
      sampleNullGroup(N_PER_GROUP),
      sampleNullGroup(N_PER_GROUP)
    );
    if (candidate < smallest) {
      smallest = candidate;
    }
    done += 1;
  }
  return smallest;
}

// Maps a p-value to its histogram bin index, clamped to 0 .. BINS - 1
// (so p = 1 lands in the last bin).
function binOf(p) {
  const raw = Math.floor(p * BINS);
  if (raw < 0) return 0;
  return raw >= BINS ? BINS - 1 : raw;
}

// Per-frame entry point: updates K from the pointer, handles clicks and the
// reset key, simulates a batch of null studies for both analysts, then
// redraws the two panels, the falling drops and the HUD.
//
// Takes one object:
//   ctx            2D canvas context used for all drawing.
//   dt             time since the previous frame; drop ages accumulate dt and
//                  expire at 0.9, so this is presumably seconds (confirm with
//                  the host).
//   width, height  current canvas size; copied into W / H when they change.
//   input          host input object. Reads input.mouseY and, when present,
//                  input.consumeClicks() (array of clicks) and
//                  input.justPressed(key).
//
// The fishing statistics are cleared whenever K changes. Otherwise the
// histogram and the observed rate would be a mixture over every K visited so
// far and could not be compared with the "expected" rate shown for the
// current K (nor would K = 1 reproduce the honest panel).
function tick({ ctx, dt, width, height, input }) {
  if (width !== W || height !== H) { W = width; H = height; }

  // mouseY scrubs K from 1..20. When the mouse is off-canvas vertically (or
  // the canvas has no height, which would make the fraction NaN), hold K.
  const insideY = H > 0 && input.mouseY >= 0 && input.mouseY <= H;
  if (insideY) {
    const frac = 1 - input.mouseY / H;        // top = high K, bottom = K=1
    const newK = Math.max(1, Math.min(20, Math.round(1 + frac * 19)));
    if (newK !== K) {
      K = newK;
      // Start the fishing tallies over so they describe this K only.
      fishHist.fill(0);
      fishTotal = 0;
      fishSig = 0;
    }
  }

  // clicks burst a chunk of studies. consumeClicks() returns an array of clicks.
  const clickArr = input.consumeClicks ? input.consumeClicks() : null;
  const clicks = clickArr && clickArr.length ? clickArr.length : 0;
  if (input.justPressed && (input.justPressed("r") || input.justPressed("R"))) {
    honestHist = new Int32Array(BINS); honestTotal = 0; honestSig = 0;
    fishHist = new Int32Array(BINS); fishTotal = 0; fishSig = 0;
    drops = [];
  }

  // Simulate this frame's studies: the base rate plus 200 per click.
  const studies = STUDIES_PER_FRAME + clicks * 200;
  for (let i = 0; i < studies; i++) {
    const ph = honestStudy();
    honestHist[binOf(ph)]++;
    honestTotal++;
    if (ph < ALPHA) honestSig++;

    const pf = fishStudy(K);
    fishHist[binOf(pf)]++;
    fishTotal++;
    if (pf < ALPHA) fishSig++;

    // Only the first study of each frame gets an animated drop.
    if (i === 0) {
      drops.push({ side: 0, p: ph, age: 0 });
      drops.push({ side: 1, p: pf, age: 0 });
    }
  }

  // ---- layout ----
  ctx.fillStyle = "#0a0a10";
  ctx.fillRect(0, 0, W, H);

  const padL = 18, padR = 18, padTop = 56, padBot = 92;
  const gap = 14;
  const panelW = (W - padL - padR - gap) / 2;
  const panelH = H - padTop - padBot;
  const leftX = padL;
  const rightX = padL + panelW + gap;
  const topY = padTop;
  const botY = padTop + panelH;

  drawPanel(ctx, leftX, topY, panelW, panelH, honestHist, honestTotal,
    "Honest test (1 pre-specified t-test)",
    "rgba(120,200,255,0.75)", "rgba(120,200,255,1)");

  drawPanel(ctx, rightX, topY, panelW, panelH, fishHist, fishTotal,
    `Fishing expedition (min of K=${K} sub-tests)`,
    "rgba(255,120,140,0.75)", "rgba(255,120,140,1)");

  // animated drops falling into the histograms (iterate backwards so that
  // splice() does not skip entries)
  for (let i = drops.length - 1; i >= 0; i--) {
    const d = drops[i];
    d.age += dt;
    if (d.age > 0.9) { drops.splice(i, 1); continue; }
    const x0 = d.side === 0 ? leftX : rightX;
    // Same 10px inset as the plot area inside drawPanel, so x lines up with p.
    const x = x0 + 10 + (d.p) * (panelW - 20);
    const y = topY + 6 + d.age * 80;
    if (y > botY - 4) continue;
    const a = 1 - d.age / 0.9;          // fade out over the drop's lifetime
    const color = d.p < ALPHA
      ? `rgba(255,210,80,${(0.9 * a).toFixed(3)})`
      : `rgba(255,255,255,${(0.6 * a).toFixed(3)})`;
    ctx.fillStyle = color;
    ctx.beginPath();
    ctx.arc(x, y, 2.5, 0, Math.PI * 2);
    ctx.fill();
  }

  // ---- title + HUD ----
  ctx.fillStyle = "#e8e8f0";
  ctx.font = "bold 16px monospace";
  ctx.fillText("p-Hacking: the Garden of Forking Paths", padL, 24);

  ctx.font = "11px monospace";
  ctx.fillStyle = "#9aa";
  ctx.fillText(`null data: two groups of N(0,1), n=${N_PER_GROUP} per arm   ·   ${(honestTotal + fishTotal).toLocaleString()} studies simulated`, padL, 42);

  // bottom stats
  const hRate = honestTotal > 0 ? honestSig / honestTotal : 0;
  const fRate = fishTotal > 0 ? fishSig / fishTotal : 0;

  // expected fishing FP rate when K independent sub-tests: 1 - (1 - alpha)^K
  const expectedFishRate = 1 - Math.pow(1 - ALPHA, K);

  ctx.font = "12px monospace";
  ctx.fillStyle = "#9cf";
  ctx.fillText(`honest false-positive rate (p<0.05):  ${(hRate * 100).toFixed(2)}%   (expected: 5.00%)`,
    padL, botY + 24);
  ctx.fillStyle = "#fca";
  ctx.fillText(`fishing false-positive rate (p<0.05): ${(fRate * 100).toFixed(2)}%   (expected: ${(expectedFishRate * 100).toFixed(2)}%)`,
    padL, botY + 42);

  // K-slider visualization on the right edge
  const sliderX = W - 14;
  ctx.fillStyle = "rgba(255,255,255,0.07)";
  ctx.fillRect(sliderX - 4, padTop, 6, panelH);
  const knobY = padTop + (1 - (K - 1) / 19) * panelH;
  ctx.fillStyle = "rgba(255,210,80,1)";
  ctx.beginPath();
  ctx.arc(sliderX - 1, knobY, 5, 0, Math.PI * 2);
  ctx.fill();
  ctx.fillStyle = "#fc8";
  ctx.font = "bold 11px monospace";
  ctx.fillText(`K=${K}`, sliderX - 38, knobY + 4);

  ctx.fillStyle = "#667";
  ctx.font = "10px monospace";
  ctx.fillText("drag mouseY to scrub K  ·  click to burst-sample  ·  [R] reset  ·  Simmons, Nelson & Simonsohn (2011)",
    padL, H - 8);
}

// Draws one histogram panel: background, BINS bars scaled to the tallest bin,
// a dashed line at the uniform expectation (total / BINS), a vertical marker
// at p = ALPHA, axis labels, the panel title and the study count.
//
//   ctx         2D canvas context to draw on
//   x, y, w, h  panel rectangle
//   hist        per-bin counts (first BINS entries are read)
//   total       number of studies in hist
//   label       title drawn at the panel's top-left
//   barColor    fill style for bars that are not in the significant region
//   lineColor   accepted for call compatibility; not used by this function
function drawPanel(ctx, x, y, w, h, hist, total, label, barColor, lineColor) {
  // Panel background.
  ctx.fillStyle = "#13131c";
  ctx.fillRect(x, y, w, h);

  // Plot area: inset 10px horizontally and 26px vertically from the panel.
  const plotLeft = x + 10;
  const plotRight = x + w - 10;
  const plotTop = y + 26;
  const plotBottom = y + h - 26;
  const plotW = plotRight - plotLeft;
  const plotH = plotBottom - plotTop;

  // Tallest bin sets the vertical scale.
  let peak = 0;
  for (let bin = 0; bin < BINS; bin++) {
    if (hist[bin] > peak) peak = hist[bin];
  }

  // Count every bin would hold if p-values were exactly uniform.
  const expectedPerBin = total / BINS;

  // Bars; bins whose midpoint lies below ALPHA are tinted yellow.
  const barW = plotW / BINS - 1;
  for (let bin = 0; bin < BINS; bin++) {
    const barH = peak > 0 ? (hist[bin] / peak) * plotH : 0;
    const significant = (bin + 0.5) / BINS < ALPHA;
    ctx.fillStyle = significant ? "rgba(255,210,80,0.95)" : barColor;
    ctx.fillRect(plotLeft + (bin / BINS) * plotW, plotBottom - barH, barW, barH);
  }

  // Dashed uniform-expectation line (drawn on whichever panel this is).
  if (total > 0 && peak > 0) {
    const expectedY = plotBottom - (expectedPerBin / peak) * plotH;
    ctx.strokeStyle = "rgba(255,255,255,0.35)";
    ctx.setLineDash([4, 4]);
    ctx.beginPath();
    ctx.moveTo(plotLeft, expectedY);
    ctx.lineTo(plotRight, expectedY);
    ctx.stroke();
    ctx.setLineDash([]);
  }

  // Sets fill + font, then draws a single string.
  const putText = (fill, font, text, tx, ty) => {
    ctx.fillStyle = fill;
    ctx.font = font;
    ctx.fillText(text, tx, ty);
  };

  // Vertical marker and caption at p = ALPHA.
  const alphaX = plotLeft + ALPHA * plotW;
  ctx.strokeStyle = "rgba(255,210,80,0.6)";
  ctx.beginPath();
  ctx.moveTo(alphaX, plotTop);
  ctx.lineTo(alphaX, plotBottom);
  ctx.stroke();
  putText("rgba(255,210,80,0.85)", "10px monospace", "α=0.05", alphaX + 3, plotTop + 10);

  // Axis tick labels and axis title share one fill and font.
  ctx.fillStyle = "#778";
  ctx.font = "10px monospace";
  const axisText = [
    ["0", plotLeft - 3, plotBottom + 12],
    ["0.5", plotLeft + plotW / 2 - 6, plotBottom + 12],
    ["1", plotRight - 4, plotBottom + 12],
    ["p-value", plotLeft + plotW / 2 - 22, plotBottom + 22],
  ];
  for (const [text, tx, ty] of axisText) {
    ctx.fillText(text, tx, ty);
  }

  // Panel title (top-left) and study count (top-right).
  putText("#e8e8f0", "bold 12px monospace", label, x + 8, y + 16);
  putText("#789", "10px monospace", `n=${total.toLocaleString()}`, plotRight - 60, y + 16);
}

p-Hacking: Garden of Forking Paths

Comments (0)