{
  "_meta": {
    "title": "benchr verified model figures",
    "purpose": "SINGLE SOURCE OF TRUTH for every model number on benchr. Charts, slides, and article pages must read figures from this file, not from prose. Each figure was confirmed against the provider's OWN official source on the verifiedDate shown.",
    "rules": [
      "Official provider source only (the company's own announcement, docs, pricing page, or model card). Third-party blogs/aggregators/leaderboards never count as a source.",
      "null means the figure could NOT be confirmed from an official source. null is never a guess — it is an honest gap. Read the matching note.",
      "Do not copy numbers from benchr article pages; they may contain errors. This file is the independent, verified record.",
      "Prices are USD per 1,000,000 tokens unless a field name says otherwise (e.g. perImage)."
    ],
    "verifiedDate": "2026-05-31",
    "reVerifiedDate": "2026-06-03",
    "reVerificationNote": "2026-06-03 re-verification (benchr tool-suite fix pass): Claude Opus 4.8/4.7 and Sonnet 4.6 context windows re-confirmed as 1,000,000 tokens on platform.claude.com (200K only on Microsoft Foundry); Opus 4.8/4.7 max output 128K, Sonnet 4.6 max output 64K. DeepSeek V4-Pro pricing re-read DIRECTLY off api-docs.deepseek.com on 2026-06-03 = $0.435 in / $0.87 out (cache-hit $0.003625) — the post-promo level held; the $1.74/$3.48 figure circulating in third-party snippets is a stale pre-promo cache and is NOT what the live page shows. Mistral Medium 3.5 added (Modified MIT, open-weight, 256K, $1.50/$7.50). Phi-4 added (MIT, 14B, 16K, self-host, Dec 2024).",
    "sourceLegend": "Each model has a `sources` map. A figure's source = the URL in `sources` for that figure's category (pricing, benchmarks, context, release, license). All figures share the model-level `verifiedDate` unless a per-figure note says otherwise.",
    "currency": "USD",
    "note": "This file is distinct from assets/data/models.json, which powers the benchr tools with editorial 0-100 capability scores and latency estimates that are NOT official figures. Verified official numbers live here."
  },
  "models": [
    {
      "id": "claude-opus-4-8",
      "name": "Claude Opus 4.8",
      "provider": "Anthropic",
      "apiModelId": "claude-opus-4-8",
      "license": "proprietary",
      "releaseDate": "2026-05-28",
      "context": { "windowTokens": 1000000, "maxOutputTokens": 128000, "maxOutputTokensBeta": 300000 },
      "pricing": {
        "inputPerM": 5.00,
        "outputPerM": 25.00,
        "fastModeInputPerM": 10.00,
        "fastModeOutputPerM": 50.00,
        "batchInputPerM": 2.50,
        "batchOutputPerM": 12.50,
        "cachedInputPerM": null
      },
      "benchmarks": {
        "SWE-bench Verified": 88.6,
        "SWE-bench Pro": 69.2,
        "SWE-bench Multilingual": 84.4,
        "SWE-bench Multimodal": 38.4,
        "Terminal-Bench 2.1": 74.6,
        "GPQA Diamond": 93.6,
        "OSWorld-Verified": 83.4,
        "BrowseComp (single-agent)": 84.3,
        "Humanity's Last Exam (no tools)": 49.8,
        "GDPval-AA (Elo)": 1890,
        "ARC-AGI-2": null
      },
      "sources": {
        "release": "https://www.anthropic.com/news/claude-opus-4-8",
        "pricing": "https://platform.claude.com/docs/en/about-claude/pricing",
        "context": "https://platform.claude.com/docs/en/about-claude/models/overview",
        "benchmarks": "https://www.anthropic.com/claude-opus-4-8-system-card"
      },
      "verifiedDate": "2026-06-03",
      "notes": "Standard price unchanged from Opus 4.7. The $10/$50 figure is the optional fast-mode rate (~2.5x output speed), NOT the base price; base is $5/$25. ARC-AGI-2 is not in the official Opus 4.8 headline summary table, so it is null (do not state one). New Opus 4.7+ tokenizer can use up to ~35% more tokens per the pricing page."
    },
    {
      "id": "claude-opus-4-7",
      "name": "Claude Opus 4.7",
      "provider": "Anthropic",
      "apiModelId": "claude-opus-4-7",
      "license": "proprietary",
      "releaseDate": "2026-04-16",
      "context": { "windowTokens": 1000000, "maxOutputTokens": 128000, "maxOutputTokensBeta": 300000 },
      "pricing": {
        "inputPerM": 5.00,
        "outputPerM": 25.00,
        "fastModeInputPerM": 30.00,
        "fastModeOutputPerM": 150.00,
        "batchInputPerM": 2.50,
        "batchOutputPerM": 12.50,
        "cachedInputPerM": null
      },
      "benchmarks": {
        "SWE-bench Verified": 87.6,
        "SWE-bench Pro": 64.3,
        "SWE-bench Multilingual": 80.5,
        "SWE-bench Multimodal": 34.5,
        "Terminal-Bench 2.0": 69.4,
        "GPQA Diamond": 94.2
      },
      "sources": {
        "release": "https://www.anthropic.com/news/claude-opus-4-7",
        "pricing": "https://platform.claude.com/docs/en/about-claude/pricing",
        "context": "https://platform.claude.com/docs/en/about-claude/models/overview",
        "benchmarks": "https://www.anthropic.com/claude-opus-4-7-system-card"
      },
      "verifiedDate": "2026-06-03",
      "notes": "Fast mode on 4.7 was $30/$150 (3x more expensive than Opus 4.8's $10/$50 fast mode). Opus 4.8 later restated Opus 4.7's OSWorld score upward (~82.x) after a test-harness fix."
    },
    {
      "id": "claude-sonnet-4-6",
      "name": "Claude Sonnet 4.6",
      "provider": "Anthropic",
      "apiModelId": "claude-sonnet-4-6",
      "license": "proprietary",
      "releaseDate": "2026-02-17",
      "context": { "windowTokens": 1000000, "maxOutputTokens": 64000, "maxOutputTokensBeta": 300000 },
      "pricing": {
        "inputPerM": 3.00,
        "outputPerM": 15.00,
        "batchInputPerM": 1.50,
        "batchOutputPerM": 7.50,
        "cachedInputPerM": null
      },
      "benchmarks": {
        "SWE-bench Verified": 79.6,
        "SWE-bench Multilingual": 75.9,
        "Terminal-Bench 2.0": 59.1,
        "OSWorld-Verified": 72.5,
        "GPQA Diamond": 89.9,
        "MMMLU": 89.3,
        "AIME 2025 (no tools)": 95.6,
        "Humanity's Last Exam (no tools)": 33.2,
        "Humanity's Last Exam (with tools)": 49.0,
        "ARC-AGI-2": 58.3,
        "tau2-bench Telecom": 97.9,
        "tau2-bench Retail": 91.7,
        "GDPval-AA (Elo)": 1633
      },
      "sources": {
        "release": "https://www.anthropic.com/news/claude-sonnet-4-6",
        "pricing": "https://platform.claude.com/docs/en/about-claude/pricing",
        "context": "https://platform.claude.com/docs/en/about-claude/models/overview",
        "benchmarks": "https://www.anthropic.com/news/claude-sonnet-4-6"
      },
      "verifiedDate": "2026-06-03",
      "notes": "No fast-mode tier (fast mode is Opus-only). Benchmark values read from the official Sonnet 4.6 System Card (Table 2.1.A). SWE-bench Verified 79.6% averaged over 25 trials (80.2% with a stated prompt modification). Anthropic flags possible AIME 2025 contamination. Anthropic reports MMMLU, not plain MMLU."
    },
    {
      "id": "claude-haiku-4-5",
      "name": "Claude Haiku 4.5",
      "provider": "Anthropic",
      "apiModelId": "claude-haiku-4-5",
      "license": "proprietary",
      "releaseDate": "2025-10-15",
      "context": { "windowTokens": 200000, "maxOutputTokens": 64000, "maxOutputTokensBeta": null },
      "pricing": {
        "inputPerM": 1.00,
        "outputPerM": 5.00,
        "batchInputPerM": 0.50,
        "batchOutputPerM": 2.50,
        "cachedInputPerM": null
      },
      "benchmarks": {
        "SWE-bench Verified": 73.3,
        "Terminal-Bench (no thinking)": 40.21,
        "Terminal-Bench (32K thinking)": 41.75,
        "GPQA Diamond": null,
        "OSWorld-Verified": null,
        "AIME 2025": null,
        "MMMLU": null
      },
      "sources": {
        "release": "https://www.anthropic.com/news/claude-haiku-4-5",
        "pricing": "https://platform.claude.com/docs/en/about-claude/pricing",
        "context": "https://platform.claude.com/docs/en/about-claude/models/overview",
        "benchmarks": "https://www.anthropic.com/news/claude-haiku-4-5"
      },
      "verifiedDate": "2026-05-31",
      "notes": "Pinned snapshot claude-haiku-4-5-20251001. SWE-bench Verified 73.3% (avg of 50 trials, 128K thinking budget) and Terminal-Bench are the only headline numbers Anthropic publishes as readable official text; GPQA/OSWorld/AIME/MMMLU appear only inside a launch-page image, so they are null (not guessed)."
    },
    {
      "id": "claude-mythos-preview",
      "name": "Claude Mythos Preview",
      "provider": "Anthropic",
      "apiModelId": null,
      "license": "proprietary",
      "releaseDate": null,
      "context": { "windowTokens": 1000000, "maxOutputTokens": null, "maxOutputTokensBeta": null },
      "pricing": {
        "inputPerM": 25.00,
        "outputPerM": 125.00,
        "note": "Stated price for approved Project Glasswing participants only; not generally purchasable."
      },
      "benchmarks": {
        "SWE-bench Pro": 77.8
      },
      "availability": "RESTRICTED — research preview under Project Glasswing. Not generally available. Invitation-only (12 named launch partners + 40+ critical-infrastructure orgs). No self-serve sign-up; no public API model id.",
      "sources": {
        "release": "https://www.anthropic.com/glasswing",
        "pricing": "https://www.anthropic.com/glasswing",
        "context": "https://platform.claude.com/docs/en/about-claude/pricing",
        "benchmarks": "https://www.anthropic.com/glasswing"
      },
      "verifiedDate": "2026-05-31",
      "notes": "Anthropic: 'We do not plan to make Claude Mythos Preview generally available.' Purpose-built for defensive cybersecurity / vulnerability research. No GA date. 1M context confirmed only via the pricing page's long-context list. Frame any benchr page as restricted/not-for-public."
    },
    {
      "id": "gpt-5",
      "name": "GPT-5",
      "provider": "OpenAI",
      "apiModelId": "gpt-5",
      "license": "proprietary",
      "releaseDate": "2025-08-07",
      "context": { "windowTokens": 400000, "maxOutputTokens": 128000 },
      "pricing": {
        "inputPerM": 1.25,
        "outputPerM": 10.00,
        "cachedInputPerM": 0.125
      },
      "benchmarks": {
        "SWE-bench Verified": 74.9,
        "AIME 2025": null,
        "GPQA Diamond": null,
        "HealthBench Hard": 46.2
      },
      "sources": {
        "release": "https://deploymentsafety.openai.com/gpt-5",
        "pricing": "https://developers.openai.com/api/docs/models/gpt-5",
        "context": "https://developers.openai.com/api/docs/models/gpt-5",
        "benchmarks": "https://cdn.openai.com/gpt-5-system-card.pdf"
      },
      "verifiedDate": "2026-05-31",
      "notes": "SWE-bench Verified 74.9% is officially OpenAI's launch-blog figure (verbosity=medium), confirmed via the system card PDF which cites it (p.36). AIME 2025 (~94.6%) and GPQA Diamond (~88.4%) are widely attributed to the launch page openai.com/index/introducing-gpt-5, which blocks automated fetchers (403); they could NOT be re-read from an accessible official source, so they are null. The 88.4% figure may refer to GPT-5 pro, not base GPT-5. OpenAI docs now label GPT-5 the previous model."
    },
    {
      "id": "gpt-5-5",
      "name": "GPT-5.5",
      "provider": "OpenAI",
      "apiModelId": "gpt-5.5",
      "license": "proprietary",
      "releaseDate": "2026-04-23",
      "context": { "windowTokens": 1050000, "maxOutputTokens": 128000 },
      "pricing": {
        "inputPerM": 5.00,
        "outputPerM": 30.00,
        "cachedInputPerM": 0.50,
        "proInputPerM": null,
        "proOutputPerM": null,
        "extendedContextSurcharge": "For sessions >272K input tokens: 2x input, 1.5x output (standard/batch/flex)."
      },
      "benchmarks": {
        "HealthBench (length-adjusted)": 56.5,
        "HealthBench Professional": 51.8,
        "SWE-bench Verified": null,
        "SWE-bench Pro": null,
        "Terminal-Bench 2.0": null,
        "OSWorld-Verified": null
      },
      "sources": {
        "release": "https://deploymentsafety.openai.com/gpt-5-5",
        "pricing": "https://developers.openai.com/api/docs/models/gpt-5.5",
        "context": "https://developers.openai.com/api/docs/models/gpt-5.5",
        "benchmarks": "https://deploymentsafety.openai.com/gpt-5-5"
      },
      "verifiedDate": "2026-05-31",
      "notes": "Flagship GPT-5.5 (and GPT-5.5 Pro) announced Apr 23, 2026 (do not confuse with GPT-5.5 Instant, the ChatGPT default released May 5, 2026). Context is 1,050,000 (not a round 1M). HealthBench figures are the only official system-card benchmarks. GPT-5.5-pro pricing (~$30/$180) and the coding/agent numbers circulating (Terminal-Bench 82.7%, OSWorld 78.7%, SWE-bench Pro 58.6%) are vendor-relayed via the 403'd launch page and could NOT be read from an accessible official source — null until confirmable. The extended-context >272K surcharge IS officially confirmed on the model doc page."
    },
    {
      "id": "gpt-5-mini",
      "name": "GPT-5 Mini",
      "provider": "OpenAI",
      "apiModelId": "gpt-5-mini",
      "license": "proprietary",
      "releaseDate": "2025-08-07",
      "context": { "windowTokens": 400000, "maxOutputTokens": 128000 },
      "pricing": {
        "inputPerM": 0.25,
        "outputPerM": 2.00,
        "cachedInputPerM": 0.025
      },
      "benchmarks": {},
      "sources": {
        "release": "https://developers.openai.com/api/docs/models/gpt-5-mini",
        "pricing": "https://developers.openai.com/api/docs/models/gpt-5-mini",
        "context": "https://developers.openai.com/api/docs/models/gpt-5-mini"
      },
      "verifiedDate": "2026-05-31",
      "notes": "Official price is $0.25/$2.00 (NOT $0.50/$4.00 as in the legacy benchr verified-table). Release date inferred from snapshot id gpt-5-mini-2025-08-07. No headline benchmark published on the model doc page (empty benchmarks object, not null figures)."
    },
    {
      "id": "gpt-image-2",
      "name": "ChatGPT Images 2.0 (GPT Image 2)",
      "provider": "OpenAI",
      "apiModelId": "gpt-image-2",
      "license": "proprietary",
      "releaseDate": "2026-04-21",
      "context": { "windowTokens": null, "maxOutputTokens": null },
      "pricing": {
        "imageInputPerM": 8.00,
        "cachedImageInputPerM": 2.00,
        "imageOutputPerM": 30.00,
        "textInputPerM": 5.00,
        "batchImageInputPerM": 4.00,
        "batchImageOutputPerM": 15.00,
        "perImage": null
      },
      "benchmarks": {},
      "sources": {
        "release": "https://developers.openai.com/api/docs/models/gpt-image-2",
        "pricing": "https://developers.openai.com/api/docs/pricing"
      },
      "verifiedDate": "2026-05-31",
      "notes": "OpenAI prices this model by TOKENS, not per image. There is NO official per-image list price; per-image figures like ~$0.006 (low) / ~$0.053 (medium) / ~$0.211 (high) for 1024x1024 are third-party calculator estimates, so perImage is null. Release date inferred from snapshot gpt-image-2-2026-04-21. Supersedes GPT Image 1."
    },
    {
      "id": "gemini-3-pro",
      "name": "Gemini 3 Pro",
      "provider": "Google",
      "apiModelId": "gemini-3-pro-preview",
      "license": "proprietary",
      "releaseDate": "2025-11-18",
      "status": "DEPRECATED — shut down 2026-03-09; replaced by Gemini 3.1 Pro.",
      "context": { "windowTokens": 1048576, "maxOutputTokens": 65536 },
      "pricing": {
        "inputPerM": null,
        "outputPerM": null,
        "note": "No longer on the live official pricing page (model retired). When live it was $2/$12 (<=200K) and $4/$18 (>200K)."
      },
      "benchmarks": {
        "SWE-bench Verified": 76.2,
        "GPQA Diamond": 91.9,
        "Humanity's Last Exam (no tools)": 37.5,
        "Terminal-Bench 2.0": 54.2,
        "LMArena (Elo)": 1501,
        "SimpleQA Verified": 72.1,
        "MMMU-Pro": 81.0,
        "MathArena Apex": 23.4
      },
      "sources": {
        "release": "https://blog.google/products-and-platforms/products/gemini/gemini-3/",
        "status": "https://ai.google.dev/gemini-api/docs/deprecations",
        "context": "https://ai.google.dev/gemini-api/docs/models/gemini-3-pro-preview",
        "benchmarks": "https://blog.google/products-and-platforms/products/gemini/gemini-3/"
      },
      "verifiedDate": "2026-05-31",
      "notes": "Live price is null because the retired model is no longer on the official pricing page (historical $2/$12 kept only as a note). Benchmarks from the Gemini 3 launch blog. Max output 65,536 on the model spec page (an older guide page said 64,000)."
    },
    {
      "id": "gemini-3-1-pro",
      "name": "Gemini 3.1 Pro",
      "provider": "Google",
      "apiModelId": "gemini-3.1-pro-preview",
      "license": "proprietary",
      "releaseDate": "2026-02-19",
      "status": "Preview (GA coming soon).",
      "context": { "windowTokens": 1000000, "maxOutputTokens": 64000 },
      "pricing": {
        "inputPerM": 2.00,
        "outputPerM": 12.00,
        "inputAbove200kPerM": 4.00,
        "outputAbove200kPerM": 18.00,
        "batchInputPerM": 1.00,
        "batchOutputPerM": 6.00,
        "batchInputAbove200kPerM": 2.00,
        "batchOutputAbove200kPerM": 9.00,
        "freeApiTier": false
      },
      "benchmarks": {
        "ARC-AGI-2": 77.1,
        "GPQA Diamond": 94.3,
        "Humanity's Last Exam (with tools)": 51.4,
        "MMMU-Pro": 80.5,
        "SWE-bench Verified": 80.6,
        "MMMLU": 92.6
      },
      "sources": {
        "release": "https://blog.google/innovation-and-ai/models-and-research/gemini-models/gemini-3-1-pro/",
        "pricing": "https://ai.google.dev/gemini-api/docs/pricing",
        "context": "https://deepmind.google/models/model-cards/gemini-3-1-pro/",
        "benchmarks": "https://deepmind.google/models/model-cards/gemini-3-1-pro/"
      },
      "verifiedDate": "2026-05-31",
      "notes": "Tiered pricing: the >200K-token tier doubles input ($4) and raises output ($18). Output price includes thinking tokens. No free API tier (free trial in AI Studio UI only)."
    },
    {
      "id": "gemini-3-5-flash",
      "name": "Gemini 3.5 Flash",
      "provider": "Google",
      "apiModelId": "gemini-3.5-flash",
      "license": "proprietary",
      "releaseDate": "2026-05-19",
      "status": "GA (stable).",
      "context": { "windowTokens": 1048576, "maxOutputTokens": 65536 },
      "pricing": {
        "inputPerM": 1.50,
        "outputPerM": 9.00,
        "batchInputPerM": 0.75,
        "batchOutputPerM": 4.50,
        "freeApiTier": true,
        "note": "Flat rate — no >200K context tier."
      },
      "benchmarks": {
        "Terminal-Bench 2.1": 76.2,
        "GDPval-AA (Elo)": 1656,
        "MCP Atlas": 83.6,
        "CharXiv Reasoning": 84.2,
        "SWE-bench Verified": null,
        "GPQA Diamond": null
      },
      "sources": {
        "release": "https://ai.google.dev/gemini-api/docs/deprecations",
        "pricing": "https://ai.google.dev/gemini-api/docs/pricing",
        "context": "https://ai.google.dev/gemini-api/docs/models/gemini-3.5-flash",
        "benchmarks": "https://blog.google/innovation-and-ai/models-and-research/gemini-models/gemini-3-5/"
      },
      "verifiedDate": "2026-05-31",
      "notes": "Released at Google I/O (May 19, 2026), GA. Flat $1.50/$9 (not context-tiered). Free API tier available. The launch blog publishes only the four agentic/coding/multimodal benchmarks above; no official SWE-bench/GPQA for 3.5 Flash, so those are null."
    },
    {
      "id": "grok-4-3",
      "name": "Grok 4.3",
      "provider": "xAI",
      "apiModelId": "grok-4.3",
      "license": "proprietary",
      "releaseDate": null,
      "context": { "windowTokens": 1000000, "maxOutputTokens": null },
      "pricing": {
        "inputPerM": 1.25,
        "outputPerM": 2.50,
        "cachedInputPerM": 0.20
      },
      "benchmarks": {},
      "sources": {
        "pricing": "https://docs.x.ai/developers/models/grok-4.3",
        "context": "https://docs.x.ai/developers/models/grok-4.3"
      },
      "verifiedDate": "2026-05-31",
      "notes": "xAI officially publishes price ($1.25/$2.50, cached $0.20), 1M context, and the model id only. It publishes NO official release date, max output, or numeric benchmarks for Grok 4.3 (only a qualitative non-hallucination claim) — those are null. Figures like 'Intelligence Index 53' or 'tau2-bench Telecom 98%' are third-party (Artificial Analysis / leaderboards), not official xAI numbers."
    },
    {
      "id": "deepseek-v4-flash",
      "name": "DeepSeek-V4-Flash",
      "provider": "DeepSeek",
      "apiModelId": "deepseek-v4-flash",
      "license": "MIT",
      "releaseDate": "2026-04-24",
      "params": { "total": "284B", "active": "13B" },
      "context": { "windowTokens": 1000000, "maxOutputTokens": 384000 },
      "pricing": {
        "inputPerM": 0.14,
        "outputPerM": 0.28,
        "cacheHitInputPerM": 0.0028,
        "selfHost": true
      },
      "benchmarks": {
        "SWE-bench Verified": 79.0,
        "GPQA Diamond": 88.1,
        "LiveCodeBench": 91.6,
        "MMLU-Pro (Think Max)": 86.2,
        "HMMT 2026 Feb": 94.8,
        "MRCR 1M": 78.7
      },
      "sources": {
        "release": "https://api-docs.deepseek.com/news/news260424",
        "pricing": "https://api-docs.deepseek.com/quick_start/pricing",
        "license": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Flash",
        "benchmarks": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Flash"
      },
      "verifiedDate": "2026-06-03",
      "notes": "Flash pricing is standard (no promo). Cache-hit input reduced to $0.0028 (1/10) effective 2026-04-26. Open weights, MIT. Benchmarks are DeepSeek's own model-card figures (Instruct / Think Max mode)."
    },
    {
      "id": "deepseek-v4-pro",
      "name": "DeepSeek-V4-Pro",
      "provider": "DeepSeek",
      "apiModelId": "deepseek-v4-pro",
      "license": "MIT",
      "releaseDate": "2026-04-24",
      "params": { "total": "1.6T", "active": "49B" },
      "context": { "windowTokens": 1000000, "maxOutputTokens": 384000 },
      "pricing": {
        "inputPerM": 0.435,
        "outputPerM": 0.87,
        "cacheHitInputPerM": 0.003625,
        "selfHost": true,
        "priceChangeNote": "These rates are PERMANENT, not a temporary low. A 75% promo ends 2026-05-31 15:59 UTC, after which DeepSeek officially sets the price to 1/4 of the original — the SAME level as the promo. The price does NOT rise."
      },
      "benchmarks": {
        "SWE-bench Verified": 80.6,
        "SWE-bench Pro": 55.4,
        "GPQA Diamond": 90.1,
        "LiveCodeBench": 93.5,
        "Terminal-Bench 2.0": 67.9,
        "MMLU-Pro (Max)": 87.5,
        "MRCR 1M": 83.5,
        "Codeforces (rating)": 3206
      },
      "sources": {
        "release": "https://api-docs.deepseek.com/news/news260424",
        "pricing": "https://api-docs.deepseek.com/quick_start/pricing",
        "license": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro",
        "benchmarks": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro"
      },
      "verifiedDate": "2026-06-03",
      "notes": "RE-VERIFIED DIRECTLY on the official pricing page on 2026-06-03 (after the 2026-05-31 promo end): live price = $0.435 in / $0.87 out, cache-hit $0.003625 — the post-promo level held. Do NOT record a post-promo rise to $1.74/$3.48; that figure appears only in stale third-party search snippets, not on the live DeepSeek page. SWE-bench 80.6% is DeepSeek's own vendor-reported figure. Open weights, MIT."
    },
    {
      "id": "kimi-k2-6",
      "name": "Kimi K2.6",
      "provider": "Moonshot AI",
      "apiModelId": "kimi-k2.6",
      "license": "Modified MIT",
      "releaseDate": null,
      "params": { "total": "1T", "active": "32B", "experts": "384 (8 active/token)" },
      "context": { "windowTokens": 262144, "maxOutputTokens": null },
      "pricing": {
        "inputPerM": 0.95,
        "outputPerM": 4.00,
        "cacheHitInputPerM": 0.16,
        "selfHost": true
      },
      "benchmarks": {
        "SWE-bench Verified": 80.2,
        "SWE-bench Multilingual": 76.7,
        "SWE-bench Pro": 58.6,
        "Terminal-Bench 2.0": 66.7,
        "LiveCodeBench v6": 89.6,
        "AIME 2026": 96.4,
        "GPQA Diamond": 90.5,
        "Humanity's Last Exam (with tools)": 54.0,
        "OSWorld-Verified": 73.1
      },
      "sources": {
        "release": null,
        "pricing": "https://platform.kimi.ai/docs/pricing/chat-k26",
        "license": "https://huggingface.co/moonshotai/Kimi-K2.6/blob/main/LICENSE",
        "benchmarks": "https://huggingface.co/moonshotai/Kimi-K2.6"
      },
      "verifiedDate": "2026-05-31",
      "notes": "Official release date not stated on any official Moonshot page (third-party says Apr 20, 2026), so releaseDate is null. Context 262,144 (256K) confirmed on both pricing page and model card. Input $0.95 is cache-miss; cache-hit $0.16. Model card body: 1T total / 32B active (the org listing page rounds to 1.1T). Benchmarks are Moonshot's own model-card figures."
    },
    {
      "id": "mistral-large-3",
      "name": "Mistral Large 3",
      "provider": "Mistral AI",
      "apiModelId": "mistral-large-2512",
      "license": "Apache-2.0",
      "releaseDate": "2025-12-02",
      "params": { "total": "675B", "active": "41B" },
      "context": { "windowTokens": 256000, "maxOutputTokens": null },
      "pricing": {
        "inputPerM": 0.50,
        "outputPerM": 1.50,
        "cacheHitInputPerM": null,
        "selfHost": true
      },
      "benchmarks": {},
      "sources": {
        "release": "https://docs.mistral.ai/models/mistral-large-3-25-12",
        "pricing": "https://docs.mistral.ai/models/mistral-large-3-25-12",
        "license": "https://mistral.ai/news/mistral-3/",
        "benchmarks": "https://mistral.ai/news/mistral-3/"
      },
      "verifiedDate": "2026-05-31",
      "notes": "Open weights, Apache-2.0. Mistral's announcement gives only relative/leaderboard claims for Large 3, no discrete official per-benchmark scores (the ~85% AIME figure on the page belongs to a smaller reasoning variant, NOT Large 3) — so benchmarks is intentionally empty, not guessed. Cache-hit price not published."
    },
    {
      "id": "mistral-medium-3-5",
      "name": "Mistral Medium 3.5",
      "provider": "Mistral AI",
      "apiModelId": "mistral-medium-3-5",
      "license": "Modified MIT",
      "releaseDate": null,
      "params": { "total": "128B", "active": "128B", "type": "dense" },
      "context": { "windowTokens": 256000, "maxOutputTokens": null },
      "pricing": {
        "inputPerM": 1.50,
        "outputPerM": 7.50,
        "cacheHitInputPerM": null,
        "selfHost": true
      },
      "benchmarks": {},
      "sources": {
        "release": "https://mistral.ai/news/vibe-remote-agents-mistral-medium-3-5/",
        "pricing": "https://docs.mistral.ai/models/model-cards/mistral-medium-3-5-26-04",
        "license": "https://docs.mistral.ai/models/model-cards/mistral-medium-3-5-26-04",
        "context": "https://docs.mistral.ai/models/model-cards/mistral-medium-3-5-26-04"
      },
      "verifiedDate": "2026-06-03",
      "notes": "Verified 2026-06-03 against the official Mistral model card: Modified MIT license, open-weight (self-hostable on ~4 GPUs) AND offered as a hosted API, dense 128B, 256K context, $1.50/$7.50 per 1M. Official release date and max output tokens not stated on the card (null). No discrete official per-benchmark scores published — benchmarks intentionally empty."
    },
    {
      "id": "qwen-3-6-27b",
      "name": "Qwen3.6-27B (dense)",
      "provider": "Alibaba (Qwen)",
      "apiModelId": "Qwen/Qwen3.6-27B",
      "license": "Apache-2.0",
      "releaseDate": "2026-04-22",
      "params": { "total": "27B", "active": "27B", "type": "dense" },
      "context": { "windowTokens": 262144, "windowTokensExtended": 1010000, "maxOutputTokens": null },
      "pricing": { "selfHost": true, "inputPerM": null, "outputPerM": null },
      "benchmarks": {
        "SWE-bench Verified": 77.2,
        "SWE-bench Pro": 53.5,
        "Terminal-Bench 2.0": 59.3,
        "MMLU-Pro": 86.2,
        "GPQA Diamond": 87.8,
        "AIME 2026": 94.1,
        "MMMU": 82.9
      },
      "sources": {
        "release": "https://huggingface.co/Qwen/Qwen3.6-27B",
        "license": "https://huggingface.co/Qwen/Qwen3.6-27B",
        "benchmarks": "https://huggingface.co/Qwen/Qwen3.6-27B"
      },
      "verifiedDate": "2026-05-31",
      "notes": "Open weight, Apache-2.0, self-host (no per-token list price). 262,144 native context, extensible ~1M via YaRN. Benchmarks from the official HF model card."
    },
    {
      "id": "qwen-3-6-35b-a3b",
      "name": "Qwen3.6-35B-A3B (MoE)",
      "provider": "Alibaba (Qwen)",
      "apiModelId": "Qwen/Qwen3.6-35B-A3B",
      "license": "Apache-2.0",
      "releaseDate": "2026-04-16",
      "params": { "total": "35B", "active": "3B", "experts": "256 (8 routed + 1 shared)" },
      "context": { "windowTokens": 262144, "windowTokensExtended": 1010000, "maxOutputTokens": null },
      "pricing": { "selfHost": true, "inputPerM": null, "outputPerM": null },
      "benchmarks": {
        "SWE-bench Verified": 73.4,
        "SWE-bench Multilingual": 67.2,
        "Terminal-Bench 2.0": 51.5,
        "MMLU-Pro": 85.2,
        "AIME 2026": 92.7,
        "GPQA Diamond": 86.0,
        "MMMU": 81.7
      },
      "sources": {
        "release": "https://huggingface.co/Qwen/Qwen3.6-35B-A3B",
        "license": "https://huggingface.co/Qwen/Qwen3.6-35B-A3B",
        "benchmarks": "https://huggingface.co/Qwen/Qwen3.6-35B-A3B"
      },
      "verifiedDate": "2026-05-31",
      "notes": "Open weight, Apache-2.0, self-host. NOTE: a 'Qwen3.6-Plus' hosted/cloud flagship could NOT be confirmed on any official Alibaba/Qwen page (Model Studio still lists Qwen3.5-Plus); treat Qwen3.6-Plus pricing/specs as unverified. Only the two open-weight variants (27B, 35B-A3B) are officially confirmed."
    },
    {
      "id": "llama-4-scout",
      "name": "Llama 4 Scout",
      "provider": "Meta",
      "apiModelId": "meta-llama/Llama-4-Scout-17B-16E",
      "license": "Llama 4 Community License",
      "releaseDate": "2025-04-05",
      "params": { "total": "109B", "active": "17B", "experts": "16" },
      "context": { "windowTokens": 10000000, "maxOutputTokens": null },
      "pricing": { "selfHost": true, "inputPerM": null, "outputPerM": null },
      "benchmarks": {
        "MMLU-Pro (0-shot)": 74.3,
        "GPQA Diamond": 57.2,
        "MMMU": 73.4,
        "MathVista": 73.7,
        "LiveCodeBench": 32.8
      },
      "sources": {
        "release": "https://ai.meta.com/blog/llama-4-multimodal-intelligence/",
        "license": "https://www.llama.com/llama4/license/",
        "context": "https://www.llama.com/docs/model-cards-and-prompt-formats/llama4/",
        "benchmarks": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E"
      },
      "verifiedDate": "2026-05-31",
      "notes": "Open weight; $0 to self-host (Meta sells no API). Community License: >700M MAU needs a separate Meta license; not OSI-approved. 10M-token context. Benchmarks from Meta's official instruction-tuned model card."
    },
    {
      "id": "llama-4-maverick",
      "name": "Llama 4 Maverick",
      "provider": "Meta",
      "apiModelId": "meta-llama/Llama-4-Maverick-17B-128E-Instruct",
      "license": "Llama 4 Community License",
      "releaseDate": "2025-04-05",
      "params": { "total": "400B", "active": "17B", "experts": "128" },
      "context": { "windowTokens": 1000000, "maxOutputTokens": null },
      "pricing": { "selfHost": true, "inputPerM": null, "outputPerM": null },
      "benchmarks": {
        "MMLU-Pro (0-shot)": 80.5,
        "GPQA Diamond": 69.8,
        "LiveCodeBench": 43.4,
        "MGSM": 92.3
      },
      "sources": {
        "release": "https://ai.meta.com/blog/llama-4-multimodal-intelligence/",
        "license": "https://www.llama.com/llama4/license/",
        "context": "https://www.llama.com/docs/model-cards-and-prompt-formats/llama4/",
        "benchmarks": "https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct"
      },
      "verifiedDate": "2026-05-31",
      "notes": "Open weight; $0 to self-host. 1M-token context, 128 experts. Llama 4 Behemoth (288B active / ~2T total) was only ever previewed as 'still training' and was never released — do not list specs for it as a usable model."
    },
    {
      "id": "phi-4",
      "name": "Phi-4",
      "provider": "Microsoft",
      "apiModelId": "microsoft/phi-4",
      "license": "MIT",
      "releaseDate": "2024-12-12",
      "params": { "total": "14B", "active": "14B", "type": "dense" },
      "context": { "windowTokens": 16000, "maxOutputTokens": null },
      "pricing": { "selfHost": true, "inputPerM": null, "outputPerM": null },
      "benchmarks": {
        "GPQA Diamond": 56.1,
        "MMLU": 84.8,
        "HumanEval": 82.6,
        "MATH": 80.4
      },
      "sources": {
        "release": "https://huggingface.co/microsoft/phi-4",
        "license": "https://huggingface.co/microsoft/phi-4",
        "context": "https://huggingface.co/microsoft/phi-4",
        "benchmarks": "https://www.microsoft.com/en-us/research/publication/phi-4-technical-report/"
      },
      "verifiedDate": "2026-06-03",
      "notes": "Verified 2026-06-03: MIT-licensed, 14B dense, 16K context, self-host only (no Microsoft per-token API; available on Azure AI Foundry + Hugging Face). Released 2024-12-12. Benchmarks from the Phi-4 technical report / model card."
    }
  ]
}