{
  "updated": "2026-06-03",
  "verifiedDate": "2026-06-03",
  "note": "FACTUAL fields (pricing, context window, max output, release date) are sourced from each provider's own official docs and reconciled to assets/data/model-figures.json (the single source of truth); where the two disagreed, the officially sourced figure won. The benchmarks block: swe_bench_verified and gpqa_diamond are reconciled to the official published figure where the provider published one (see model-figures.json). Where no official figure exists, swe_bench_verified is a benchr editorial ESTIMATE (the model lists it in benchmarks_estimated) and gpqa_diamond is null. All capability 0–100 scores and latency numbers are benchr EDITORIAL estimates, not lab measurements — see the methodology page. Re-verified 2026-06-03: Claude Opus 4.8/4.7 + Sonnet 4.6 context windows (now 1M), DeepSeek V4-Pro/Flash pricing, Mistral Medium 3.5 license/price, Qwen3.6 open-weight specs.",
  "models": [
    {
      "id": "claude-opus-4-8",
      "name": "Claude Opus 4.8",
      "company": "Anthropic",
      "type": "frontier",
      "license": "proprietary",
      "released": "2026-05-28",
      "api_name": "claude-opus-4-8",
      "review_url": "articles/claude-opus-4-8-review",
      "deprecated": false,
      "pricing": {
        "input_per_million": 5.00,
        "output_per_million": 25.00,
        "cache_input_per_million": 0.50,
        "batch_discount": 0.50,
        "fast_mode_input": 10.00,
        "fast_mode_output": 50.00
      },
      "context": { "max_tokens": 1000000, "effective_tokens": 700000, "max_output_tokens": 128000 },
      "benchmarks": {
        "swe_bench_verified": 88.6,
        "lmsys_arena": 1425,
        "mmlu": 93.5,
        "humaneval": 96.0,
        "math": 93.0,
        "gpqa_diamond": 93.6,
        "arc_agi_2": null
      },
      "latency": { "first_token_ms": 700, "tokens_per_second": 68 },
      "capabilities": { "coding": 97, "reasoning": 96, "writing": 91, "vision": 86, "long_context": 94, "multilingual": 90 },
      "best_for": ["Highest-stakes coding tasks", "Complex multi-step agents", "Architecture decisions", "Production SWE-bench-level work"],
      "skip_if": ["You need cheap volume", "Sonnet handles your workload", "Speed is the priority — use Fast Mode instead"]
    },
    {
      "id": "claude-opus-4-7",
      "name": "Claude Opus 4.7",
      "company": "Anthropic",
      "type": "frontier",
      "license": "proprietary",
      "released": "2026-04",
      "api_name": "claude-opus-4-7",
      "review_url": "articles/claude-opus-4-7-review",
      "deprecated": false,
      "pricing": {
        "input_per_million": 5.00,
        "output_per_million": 25.00,
        "cache_input_per_million": 0.50,
        "batch_discount": 0.50
      },
      "context": { "max_tokens": 1000000, "effective_tokens": 700000, "max_output_tokens": 128000 },
      "benchmarks": {
        "swe_bench_verified": 87.6,
        "lmsys_arena": 1410,
        "mmlu": 92.5,
        "humaneval": 95.2,
        "math": 91.8,
        "gpqa_diamond": 94.2,
        "arc_agi_2": 24.0
      },
      "latency": { "first_token_ms": 720, "tokens_per_second": 65 },
      "capabilities": { "coding": 96, "reasoning": 96, "writing": 90, "vision": 85, "long_context": 94, "multilingual": 89 },
      "best_for": ["Complex coding tasks", "Long-document analysis", "Production agent loops", "Architecture decisions"],
      "skip_if": ["You need cheap volume", "Simple summarization", "Sonnet covers your workload"]
    },
    {
      "id": "claude-sonnet-4-6",
      "name": "Claude Sonnet 4.6",
      "company": "Anthropic",
      "type": "mid",
      "license": "proprietary",
      "released": "2025-09",
      "api_name": "claude-sonnet-4-6",
      "review_url": "articles/claude-sonnet-4-6-review",
      "deprecated": false,
      "pricing": {
        "input_per_million": 3.00,
        "output_per_million": 15.00,
        "cache_input_per_million": 0.30,
        "batch_discount": 0.50
      },
      "context": { "max_tokens": 1000000, "effective_tokens": 700000, "max_output_tokens": 64000 },
      "benchmarks": {
        "swe_bench_verified": 79.6,
        "lmsys_arena": 1365,
        "mmlu": 89.6,
        "humaneval": 91.5,
        "math": 86.0,
        "gpqa_diamond": 89.9,
        "arc_agi_2": 15.0
      },
      "latency": { "first_token_ms": 480, "tokens_per_second": 95 },
      "capabilities": { "coding": 88, "reasoning": 87, "writing": 89, "vision": 80, "long_context": 91, "multilingual": 86 },
      "best_for": ["Production default", "Cost-effective coding", "Bulk content tasks", "Daily-driver API workloads"],
      "skip_if": ["You need frontier-grade reasoning"]
    },
    {
      "id": "claude-haiku-4-5",
      "name": "Claude Haiku 4.5",
      "company": "Anthropic",
      "type": "small",
      "license": "proprietary",
      "released": "2025-11",
      "api_name": "claude-haiku-4-5",
      "review_url": "articles/claude-haiku-4-5-review",
      "deprecated": false,
      "pricing": {
        "input_per_million": 1.00,
        "output_per_million": 5.00,
        "cache_input_per_million": 0.10,
        "batch_discount": 0.50
      },
      "context": { "max_tokens": 200000, "effective_tokens": 150000, "max_output_tokens": 64000 },
      "benchmarks": {
        "swe_bench_verified": 73.3,
        "lmsys_arena": 1285,
        "mmlu": 82.5,
        "humaneval": 85.0,
        "math": 77.0,
        "gpqa_diamond": null,
        "arc_agi_2": 8.5
      },
      "latency": { "first_token_ms": 240, "tokens_per_second": 145 },
      "capabilities": { "coding": 75, "reasoning": 76, "writing": 80, "vision": 72, "long_context": 82, "multilingual": 78 },
      "best_for": ["High-volume simple tasks", "Real-time chat", "Classification, routing, extraction"],
      "skip_if": ["Complex reasoning needed", "Long-document analysis"]
    },
    {
      "id": "gpt-5-5",
      "benchmarks_estimated": ["swe_bench_verified"],
      "name": "GPT-5.5",
      "company": "OpenAI",
      "type": "frontier",
      "license": "proprietary",
      "released": "2026-04-23",
      "api_name": "gpt-5.5",
      "review_url": "articles/gpt-5-5-review",
      "deprecated": false,
      "pricing": {
        "input_per_million": 5.00,
        "output_per_million": 30.00,
        "cache_input_per_million": 0.50,
        "batch_discount": 0.50,
        "batch_input": 2.50,
        "batch_output": 15.00
      },
      "context": { "max_tokens": 1050000, "effective_tokens": 700000, "max_output_tokens": 128000 },
      "benchmarks": {
        "swe_bench_verified": 84.0,
        "lmsys_arena": 1418,
        "mmlu": 93.0,
        "humaneval": 94.5,
        "math": 95.0,
        "gpqa_diamond": null,
        "arc_agi_2": 22.0
      },
      "latency": { "first_token_ms": 520, "tokens_per_second": 82 },
      "capabilities": { "coding": 93, "reasoning": 95, "writing": 88, "vision": 89, "long_context": 86, "multilingual": 90 },
      "best_for": ["Frontier math and reasoning", "Computer use", "Multi-step agents", "Vision + reasoning tasks"],
      "skip_if": ["Cost-sensitive workloads — use GPT-5", "Quick chat — the price is hard to justify"]
    },
    {
      "id": "gpt-5",
      "name": "GPT-5",
      "company": "OpenAI",
      "type": "frontier",
      "license": "proprietary",
      "released": "2025-08-07",
      "api_name": "gpt-5",
      "review_url": "articles/gpt-5-review",
      "deprecated": false,
      "pricing": {
        "input_per_million": 1.25,
        "output_per_million": 10.00,
        "cache_input_per_million": null,
        "batch_discount": 0.50
      },
      "context": { "max_tokens": 400000, "effective_tokens": 300000, "max_output_tokens": 128000 },
      "benchmarks": {
        "swe_bench_verified": 74.9,
        "lmsys_arena": 1375,
        "mmlu": 90.0,
        "humaneval": 91.0,
        "math": 87.0,
        "gpqa_diamond": null,
        "arc_agi_2": 16.0
      },
      "latency": { "first_token_ms": 520, "tokens_per_second": 90 },
      "capabilities": { "coding": 88, "reasoning": 88, "writing": 86, "vision": 85, "long_context": 78, "multilingual": 87 },
      "best_for": ["Production workhorse at a rational price", "Breadth tasks", "Coding agents", "The everyday OpenAI pick"],
      "skip_if": ["You need the absolute deepest reasoning — use 5.5", "Very large context windows"]
    },
    {
      "id": "gpt-5-mini",
      "benchmarks_estimated": ["swe_bench_verified"],
      "name": "GPT-5 Mini",
      "company": "OpenAI",
      "type": "small",
      "license": "proprietary",
      "released": "2025-08",
      "api_name": "gpt-5-mini",
      "review_url": null,
      "deprecated": false,
      "pricing": {
        "input_per_million": 0.25,
        "output_per_million": 2.00,
        "cache_input_per_million": 0.025,
        "batch_discount": 0.50
      },
      "context": { "max_tokens": 400000, "effective_tokens": 200000, "max_output_tokens": 128000 },
      "benchmarks": {
        "swe_bench_verified": 48.0,
        "lmsys_arena": 1275,
        "mmlu": 82.0,
        "humaneval": 78.0,
        "math": 74.0,
        "gpqa_diamond": null,
        "arc_agi_2": 6.0
      },
      "latency": { "first_token_ms": 220, "tokens_per_second": 160 },
      "capabilities": { "coding": 72, "reasoning": 74, "writing": 75, "vision": 72, "long_context": 70, "multilingual": 80 },
      "best_for": ["Cheap chat at scale", "Simple extraction and routing", "High-volume classification"],
      "skip_if": ["Production-quality code review", "Complex reasoning", "Long documents"]
    },
    {
      "id": "gemini-3-5-flash",
      "benchmarks_estimated": ["swe_bench_verified"],
      "name": "Gemini 3.5 Flash",
      "company": "Google",
      "type": "mid",
      "license": "proprietary",
      "released": "2026-05-19",
      "api_name": "gemini-3.5-flash",
      "review_url": "articles/gemini-3-5-flash-review",
      "deprecated": false,
      "pricing": {
        "input_per_million": 1.50,
        "output_per_million": 9.00,
        "cache_input_per_million": 0.15,
        "batch_discount": null,
        "free_tier": true
      },
      "context": { "max_tokens": 1048576, "effective_tokens": 700000, "max_output_tokens": 65536 },
      "benchmarks": {
        "swe_bench_verified": 80.6,
        "lmsys_arena": 1385,
        "mmlu": 89.5,
        "humaneval": 89.0,
        "math": 87.5,
        "gpqa_diamond": null,
        "arc_agi_2": 14.5
      },
      "latency": { "first_token_ms": 195, "tokens_per_second": 289 },
      "capabilities": { "coding": 88, "reasoning": 86, "writing": 84, "vision": 92, "long_context": 90, "multilingual": 91 },
      "best_for": ["Coding agents at speed", "Parallel agent execution", "Multimodal tasks", "Default frontier-quality model"],
      "skip_if": ["You need the deepest single-call reasoning — use Gemini 3.1 Pro"]
    },
    {
      "id": "gemini-3-1-pro",
      "name": "Gemini 3.1 Pro",
      "company": "Google",
      "type": "frontier",
      "license": "proprietary",
      "released": "2026-02",
      "api_name": "gemini-3.1-pro",
      "review_url": "articles/gemini-3-1-pro-review",
      "deprecated": false,
      "pricing": {
        "input_per_million": 2.00,
        "output_per_million": 12.00,
        "input_per_million_over_200k": 4.00,
        "output_per_million_over_200k": 18.00,
        "cache_input_per_million": 0.20,
        "batch_discount": null
      },
      "context": { "max_tokens": 1000000, "effective_tokens": 800000, "max_output_tokens": 64000 },
      "benchmarks": {
        "swe_bench_verified": 80.6,
        "lmsys_arena": 1380,
        "mmlu": 91.0,
        "humaneval": 90.0,
        "math": 90.0,
        "gpqa_diamond": 94.3,
        "arc_agi_2": 17.0
      },
      "latency": { "first_token_ms": 380, "tokens_per_second": 108 },
      "capabilities": { "coding": 84, "reasoning": 90, "writing": 84, "vision": 95, "long_context": 92, "multilingual": 91 },
      "best_for": ["Deep reasoning in the Gemini family", "Long-context vision work", "Workspace integration"],
      "skip_if": ["Coding agents — Flash is faster and cheaper", "Cost-sensitive workloads — note the over-200K price bump"]
    },
    {
      "id": "grok-4-3",
      "benchmarks_estimated": ["swe_bench_verified"],
      "name": "Grok 4.3",
      "company": "xAI",
      "type": "frontier",
      "license": "proprietary",
      "released": "2026-04",
      "api_name": "grok-4-3",
      "review_url": "articles/grok-4-3-review",
      "deprecated": false,
      "pricing": {
        "input_per_million": 1.25,
        "output_per_million": 2.50,
        "cache_input_per_million": 0.20,
        "batch_discount": null
      },
      "context": { "max_tokens": 1000000, "effective_tokens": 800000, "max_output_tokens": null },
      "benchmarks": {
        "swe_bench_verified": 68.0,
        "lmsys_arena": 1370,
        "mmlu": 87.0,
        "humaneval": 87.0,
        "math": 84.0,
        "gpqa_diamond": null,
        "arc_agi_2": 10.0
      },
      "latency": { "first_token_ms": 300, "tokens_per_second": 120 },
      "capabilities": { "coding": 82, "reasoning": 84, "writing": 80, "vision": 78, "long_context": 85, "multilingual": 75 },
      "best_for": ["Real-time data access via X", "Very cheap output tokens", "Long-context tasks on a budget"],
      "skip_if": ["Coding-first agent work — others edge it out", "Non-English-heavy workloads"]
    },
    {
      "id": "deepseek-v4-pro",
      "name": "DeepSeek V4-Pro",
      "company": "DeepSeek",
      "type": "frontier-open",
      "license": "MIT",
      "released": "2026-04-24",
      "api_name": "deepseek-v4-pro",
      "review_url": "articles/deepseek-review",
      "deprecated": false,
      "pricing": {
        "input_per_million": 0.435,
        "output_per_million": 0.87,
        "cache_input_per_million": 0.003625,
        "batch_discount": null,
        "self_hosted": true
      },
      "context": { "max_tokens": 1000000, "effective_tokens": 700000, "max_output_tokens": 384000 },
      "benchmarks": {
        "swe_bench_verified": 80.6,
        "lmsys_arena": 1390,
        "mmlu": 92.8,
        "humaneval": 93.5,
        "math": 96.1,
        "gpqa_diamond": 90.1,
        "arc_agi_2": 17.5
      },
      "latency": { "first_token_ms": 380, "tokens_per_second": 95 },
      "capabilities": { "coding": 93, "reasoning": 92, "writing": 84, "vision": null, "long_context": 86, "multilingual": 87 },
      "best_for": ["Frontier-grade open-weight coding", "Math-heavy work", "Self-hosted production at near-zero API cost"],
      "skip_if": ["You need vision/multimodal", "You can't manage GPU hosting"]
    },
    {
      "id": "deepseek-v4-flash",
      "name": "DeepSeek V4-Flash",
      "company": "DeepSeek",
      "type": "open",
      "license": "MIT",
      "released": "2026-04-24",
      "api_name": "deepseek-v4-flash",
      "review_url": "articles/deepseek-review",
      "deprecated": false,
      "pricing": {
        "input_per_million": 0.14,
        "output_per_million": 0.28,
        "cache_input_per_million": 0.0028,
        "batch_discount": null,
        "self_hosted": true
      },
      "context": { "max_tokens": 1000000, "effective_tokens": 600000, "max_output_tokens": 384000 },
      "benchmarks": {
        "swe_bench_verified": 79.0,
        "lmsys_arena": 1350,
        "mmlu": 88.0,
        "humaneval": 87.5,
        "math": 89.0,
        "gpqa_diamond": 88.1,
        "arc_agi_2": 11.0
      },
      "latency": { "first_token_ms": 290, "tokens_per_second": 135 },
      "capabilities": { "coding": 86, "reasoning": 86, "writing": 80, "vision": null, "long_context": 88, "multilingual": 86 },
      "best_for": ["Cheapest frontier-grade inference on the market", "Self-hosted at low cost", "Volume coding tasks"],
      "skip_if": ["You need frontier-tier reasoning depth", "Vision-first workflows"]
    },
    {
      "id": "kimi-k2-6",
      "name": "Kimi K2.6",
      "company": "Moonshot AI",
      "type": "frontier-open",
      "license": "Modified MIT",
      "released": "2026-04-20",
      "api_name": "kimi-k2-6",
      "review_url": "articles/kimi-review",
      "deprecated": false,
      "pricing": {
        "input_per_million": 0.95,
        "output_per_million": 4.00,
        "cache_input_per_million": 0.16,
        "batch_discount": null,
        "self_hosted": true
      },
      "context": { "max_tokens": 262144, "effective_tokens": 200000, "max_output_tokens": null },
      "benchmarks": {
        "swe_bench_verified": 80.2,
        "lmsys_arena": 1355,
        "mmlu": 85.0,
        "humaneval": 84.0,
        "math": 82.0,
        "gpqa_diamond": 90.5,
        "arc_agi_2": 9.0
      },
      "latency": { "first_token_ms": 350, "tokens_per_second": 100 },
      "capabilities": { "coding": 80, "reasoning": 78, "writing": 74, "vision": null, "long_context": 78, "multilingual": 82 },
      "best_for": ["Mid-range open-weight option", "Multilingual tasks", "Cost-efficient API with self-hosting option"],
      "skip_if": ["You need the deepest reasoning or best coding", "Vision workloads"]
    },
    {
      "id": "mistral-large-3",
      "benchmarks_estimated": ["swe_bench_verified"],
      "name": "Mistral Large 3",
      "company": "Mistral",
      "type": "frontier-open",
      "license": "Apache 2.0",
      "released": "2025-12-02",
      "api_name": "mistral-large-2512",
      "review_url": "articles/mistral-review",
      "deprecated": false,
      "pricing": {
        "input_per_million": 0.50,
        "output_per_million": 1.50,
        "cache_input_per_million": null,
        "batch_discount": null,
        "self_hosted": true
      },
      "context": { "max_tokens": 256000, "effective_tokens": 200000, "max_output_tokens": null },
      "benchmarks": {
        "swe_bench_verified": 62.0,
        "lmsys_arena": 1340,
        "mmlu": 84.0,
        "humaneval": 83.0,
        "math": 79.0,
        "gpqa_diamond": null,
        "arc_agi_2": 7.5
      },
      "latency": { "first_token_ms": 280, "tokens_per_second": 115 },
      "capabilities": { "coding": 78, "reasoning": 79, "writing": 78, "vision": null, "long_context": 76, "multilingual": 88 },
      "best_for": ["Apache-licensed production workloads", "European data residency", "Very cheap inference with decent reasoning"],
      "skip_if": ["Coding at frontier quality", "Vision or multimodal workflows"]
    },
    {
      "id": "mistral-medium-3-5",
      "benchmarks_estimated": ["swe_bench_verified"],
      "name": "Mistral Medium 3.5",
      "company": "Mistral",
      "type": "frontier-open",
      "license": "Modified MIT",
      "released": "2026-04",
      "api_name": "mistral-medium-3-5",
      "review_url": null,
      "deprecated": false,
      "pricing": {
        "input_per_million": 1.50,
        "output_per_million": 7.50,
        "cache_input_per_million": null,
        "self_hosted": true
      },
      "context": { "max_tokens": 256000, "effective_tokens": 200000, "max_output_tokens": null },
      "benchmarks": {
        "swe_bench_verified": 77.6,
        "lmsys_arena": 1368,
        "mmlu": 89.0,
        "humaneval": 88.0,
        "math": 86.0,
        "gpqa_diamond": null,
        "arc_agi_2": 12.0
      },
      "latency": { "first_token_ms": 320, "tokens_per_second": 105 },
      "capabilities": { "coding": 86, "reasoning": 87, "writing": 86, "vision": 84, "long_context": 84, "multilingual": 92 },
      "best_for": ["European data residency", "Multimodal + reasoning in one model", "Self-hosted on 4 GPUs"],
      "skip_if": ["You need MoE efficiency for ultra-cheap inference"]
    },
    {
      "id": "qwen-3-6-27b",
      "name": "Qwen3.6-27B",
      "company": "Alibaba",
      "type": "open",
      "license": "Apache 2.0",
      "released": "2026-04-22",
      "api_name": "Qwen/Qwen3.6-27B",
      "review_url": "articles/qwen-review",
      "deprecated": false,
      "pricing": {
        "input_per_million": null,
        "output_per_million": null,
        "self_hosted": true
      },
      "context": { "max_tokens": 262144, "effective_tokens": 200000, "max_output_tokens": null },
      "benchmarks": {
        "swe_bench_verified": 77.2,
        "lmsys_arena": 1370,
        "mmlu": 89.5,
        "humaneval": 89.0,
        "math": 87.0,
        "gpqa_diamond": 87.8,
        "arc_agi_2": 12.0
      },
      "latency": { "first_token_ms": 340, "tokens_per_second": 105 },
      "capabilities": { "coding": 88, "reasoning": 86, "writing": 82, "vision": 82, "long_context": 84, "multilingual": 95 },
      "best_for": ["Multilingual coding (Chinese, Japanese, Korean, Arabic)", "Local inference on consumer GPUs — dense 27B", "Tool-use agent loops at zero API cost"],
      "skip_if": ["You want a managed hosted API — these are open weights you self-host", "Absolute deepest single-language reasoning"]
    },
    {
      "id": "llama-4-maverick",
      "benchmarks_estimated": ["swe_bench_verified"],
      "name": "Llama 4 Maverick",
      "company": "Meta",
      "type": "frontier-open",
      "license": "Llama 4 Community",
      "released": "2025-04-05",
      "api_name": "llama-4-maverick",
      "review_url": "articles/llama-4-review",
      "deprecated": false,
      "pricing": {
        "input_per_million": null,
        "output_per_million": null,
        "self_hosted": true
      },
      "context": { "max_tokens": 1000000, "effective_tokens": 700000, "max_output_tokens": null },
      "benchmarks": {
        "swe_bench_verified": 66.0,
        "lmsys_arena": 1340,
        "mmlu": 84.0,
        "humaneval": 83.0,
        "math": 80.0,
        "gpqa_diamond": 69.8,
        "arc_agi_2": 9.0
      },
      "latency": { "first_token_ms": 300, "tokens_per_second": 120 },
      "capabilities": { "coding": 80, "reasoning": 81, "writing": 76, "vision": 80, "long_context": 85, "multilingual": 80 },
      "best_for": ["Best open-weight frontier model", "Self-hosted production at zero licensing cost", "Multimodal at no API cost"],
      "skip_if": ["You need the very best reasoning or coding", "You can't manage GPU infrastructure"]
    },
    {
      "id": "llama-4-scout",
      "benchmarks_estimated": ["swe_bench_verified"],
      "name": "Llama 4 Scout",
      "company": "Meta",
      "type": "open",
      "license": "Llama 4 Community",
      "released": "2025-04-05",
      "api_name": "llama-4-scout",
      "review_url": "articles/llama-4-review",
      "deprecated": false,
      "pricing": {
        "input_per_million": null,
        "output_per_million": null,
        "self_hosted": true
      },
      "context": { "max_tokens": 10000000, "effective_tokens": 2000000, "max_output_tokens": null },
      "benchmarks": {
        "swe_bench_verified": 56.0,
        "lmsys_arena": 1295,
        "mmlu": 80.0,
        "humaneval": 76.0,
        "math": 74.0,
        "gpqa_diamond": 57.2,
        "arc_agi_2": 6.0
      },
      "latency": { "first_token_ms": 180, "tokens_per_second": 180 },
      "capabilities": { "coding": 73, "reasoning": 74, "writing": 70, "vision": 75, "long_context": 92, "multilingual": 78 },
      "best_for": ["Ultra-long-context tasks (10M token window)", "Fast self-hosted inference", "Free multimodal at scale"],
      "skip_if": ["Deep reasoning tasks", "When frontier coding quality is needed"]
    },
    {
      "id": "phi-4",
      "benchmarks_estimated": ["swe_bench_verified"],
      "name": "Phi-4",
      "company": "Microsoft",
      "type": "small-open",
      "license": "MIT",
      "released": "2025-01",
      "api_name": "phi-4",
      "review_url": null,
      "deprecated": false,
      "pricing": {
        "input_per_million": null,
        "output_per_million": null,
        "self_hosted": true
      },
      "context": { "max_tokens": 16000, "effective_tokens": 14000, "max_output_tokens": null },
      "benchmarks": {
        "swe_bench_verified": 30.0,
        "lmsys_arena": 1180,
        "mmlu": 84.8,
        "humaneval": 82.6,
        "math": 80.4,
        "gpqa_diamond": 56.1,
        "arc_agi_2": 5.0
      },
      "latency": { "first_token_ms": 95, "tokens_per_second": 220 },
      "capabilities": { "coding": 70, "reasoning": 78, "writing": 74, "vision": null, "long_context": 60, "multilingual": 72 },
      "best_for": ["Local inference on consumer hardware", "Edge deployment", "Reasoning at tiny scale"],
      "skip_if": ["Long-context tasks", "Production-quality writing or coding"]
    }
  ],
  "dimensions": [
    {
      "id": "pricing",
      "label": "Pricing",
      "icon": "$",
      "sub_dimensions": [
        { "id": "input_per_million", "label": "Input / 1M tokens", "unit": "$", "format": "currency", "lower_better": true },
        { "id": "output_per_million", "label": "Output / 1M tokens", "unit": "$", "format": "currency", "lower_better": true },
        { "id": "cache_input_per_million", "label": "Cached input / 1M tokens", "unit": "$", "format": "currency", "lower_better": true }
      ]
    },
    {
      "id": "context",
      "label": "Context window",
      "icon": "↔",
      "sub_dimensions": [
        { "id": "max_tokens", "label": "Max tokens", "unit": "tokens", "format": "number", "lower_better": false },
        { "id": "effective_tokens", "label": "Effective retrieval zone", "unit": "tokens", "format": "number", "lower_better": false },
        { "id": "max_output_tokens", "label": "Max output tokens", "unit": "tokens", "format": "number", "lower_better": false }
      ]
    },
    {
      "id": "benchmarks",
      "label": "Benchmarks",
      "icon": "★",
      "sub_dimensions": [
        { "id": "swe_bench_verified", "label": "SWE-bench Verified", "unit": "%", "format": "percent", "lower_better": false },
        { "id": "lmsys_arena", "label": "LMSYS Arena", "unit": "score", "format": "number", "lower_better": false },
        { "id": "mmlu", "label": "MMLU", "unit": "%", "format": "percent", "lower_better": false },
        { "id": "humaneval", "label": "HumanEval", "unit": "%", "format": "percent", "lower_better": false },
        { "id": "math", "label": "MATH", "unit": "%", "format": "percent", "lower_better": false },
        { "id": "gpqa_diamond", "label": "GPQA Diamond", "unit": "%", "format": "percent", "lower_better": false },
        { "id": "arc_agi_2", "label": "ARC-AGI 2", "unit": "%", "format": "percent", "lower_better": false }
      ]
    },
    {
      "id": "latency",
      "label": "Speed",
      "icon": "⚡",
      "sub_dimensions": [
        { "id": "first_token_ms", "label": "First token (ms)", "unit": "ms", "format": "number", "lower_better": true },
        { "id": "tokens_per_second", "label": "Tokens / second", "unit": "tok/s", "format": "number", "lower_better": false }
      ]
    },
    {
      "id": "capabilities",
      "label": "Capabilities (0–100)",
      "icon": "◉",
      "sub_dimensions": [
        { "id": "coding", "label": "Coding", "unit": "/100", "format": "number", "lower_better": false },
        { "id": "reasoning", "label": "Reasoning", "unit": "/100", "format": "number", "lower_better": false },
        { "id": "writing", "label": "Writing", "unit": "/100", "format": "number", "lower_better": false },
        { "id": "vision", "label": "Vision", "unit": "/100", "format": "number", "lower_better": false },
        { "id": "long_context", "label": "Long context", "unit": "/100", "format": "number", "lower_better": false },
        { "id": "multilingual", "label": "Multilingual", "unit": "/100", "format": "number", "lower_better": false }
      ]
    }
  ]
}
