{"total_count":1390,"offset":0,"limit":100,"data":[{"id":"leezx4dh5k0ox7y80c5ca1cc","name":"coverageproof-env","description":"Certified document-QA RL environment: span-F1 + absence-precision rewards from machine-recheckable certificates","visibility":"PUBLIC","owner":{"type":"user","name":"sovnode"},"created_at":"2026-07-20T04:27:01.785000","updated_at":"2026-07-20T04:27:04.283000","tags":["rl","rl-environment","document-qa","evaluation","abstention","grounding","hallucination"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.6"},{"id":"k501avu86y3xglce8cp15hrz","name":"iol-two-pass-rl","description":"Strict row-level two-pass RL aligned with IOL-AI competition inference","visibility":"PUBLIC","owner":{"type":"user","name":"cardan05"},"created_at":"2026-07-19T18:44:01.939000","updated_at":"2026-07-19T21:56:58.019000","tags":["linguistics","reasoning","multi-turn","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.6.0"},{"id":"hqox7el7t3xnijl2rpzyup10","name":"llm-sudoku","description":"Multi-turn Sudoku with reusable oracle datasets and normalized blank-cell accuracy rewards","visibility":"PUBLIC","owner":{"type":"user","name":"hyperpotatoneo"},"created_at":"2026-07-19T13:57:50.197000","updated_at":"2026-07-19T19:32:38.652000","tags":["multi-turn","reasoning","sudoku","privileged-information","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.2.0"},{"id":"r7kupw9zwujyri73iqoxf6q7","name":"writing-judge","description":"RL env: train a judge to spot planted writing violations; reward = deterministic span-F1 vs labels-by-construction (no LLM in the loop)","visibility":"PUBLIC","owner":{"type":"user","name":"rishigundakaram"},"created_at":"2026-07-19T00:55:42.530000","updated_at":"2026-07-19T01:40:35.265000","tags":["rl","single-turn","writing","judge","verifiable-reward"],"stars":0,"latest_ci_status":null,"latest_version":"0.2.0"},{"id":"q8b4go7p8h8akbagzacrus66","name":"muni-bdbv-gp","description":"Agentic antiviral design against Bundibugyo ebolavirus GP entry — sandboxed Muni/OnePot tool use with onepot-CORE plausibility reward","visibility":"PUBLIC","owner":{"type":"team","name":"prime"},"created_at":"2026-07-19T00:42:13.556000","updated_at":"2026-07-20T22:32:42.003000","tags":["multi-turn","sandbox","tool-use","drug-discovery","ebolathon"],"stars":0,"latest_ci_status":null,"latest_version":"0.2.1"},{"id":"oz7c8vzgr72mvx2yj5rw8uet","name":"writing-rewrite","description":"RL env: rewrite messy drafts into plain English; reward = within-group clarity rank x content-preservation gate (LLM judges)","visibility":"PUBLIC","owner":{"type":"user","name":"rishigundakaram"},"created_at":"2026-07-18T23:41:58.520000","updated_at":"2026-07-19T04:20:32.655000","tags":["rl","single-turn","writing","llm-judge"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.3"},{"id":"b09ma1ruqrk66gv788jzg4nl","name":"gpu-deal-judge-v1","description":"PC-purchase decision benchmark for Decision Frontier.","visibility":"PUBLIC","owner":{"type":"user","name":"tarive"},"created_at":"2026-07-18T22:53:07.745000","updated_at":"2026-07-18T23:13:47.028000","tags":["single-turn","commerce","gpu","macbook","ram","v1"],"stars":1,"latest_ci_status":null,"latest_version":"0.1.1"},{"id":"dogs0zcwfopescimf89lrrub","name":"gpu-deal-judge","description":"GPU purchase-decision judge environment (AITX SAT 2026) - episodic-memory RSI eval","visibility":"PUBLIC","owner":{"type":"user","name":"tarive"},"created_at":"2026-07-18T22:33:45.762000","updated_at":"2026-07-18T22:47:18.390000","tags":[],"stars":0,"latest_ci_status":null,"latest_version":"0.1.4"},{"id":"c041ba0c7sep75h6x11aguxp","name":"crayfish","description":"RL Environment to train LLM to play chess against Stockfish, and WIN","visibility":"PUBLIC","owner":{"type":"user","name":"cletusigwe"},"created_at":"2026-07-17T16:38:31.184000","updated_at":"2026-07-17T23:20:33.159000","tags":["placeholder-tag","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.9"},{"id":"zwchgkzwjfrn8xlxn45y5m2n","name":"git-surgery","description":"git-surgery — hard git-repair tasks for a terminal agent, with deterministic, hack-resistant rewards.","visibility":"PUBLIC","owner":{"type":"user","name":"rohseh303"},"created_at":"2026-07-17T05:25:30.665000","updated_at":"2026-07-17T05:25:34.516000","tags":["terminal","coding","tool-use","git","agentic"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"krtg09rj5ux36ksgb2xofi3q","name":"compressed-gsm8k-public","description":"GSM8K with group-relative rewards for compressed correct reasoning.","visibility":"PUBLIC","owner":{"type":"user","name":"will"},"created_at":"2026-07-16T08:19:44.495000","updated_at":"2026-07-16T08:51:11.733000","tags":["math","reasoning","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.12"},{"id":"vbcwljlrmd0z189aakwlo6k7","name":"compressed-gsm8k-sprints","description":"GSM8K with group-relative rewards for compressed correct reasoning.","visibility":"PUBLIC","owner":{"type":"user","name":"will"},"created_at":"2026-07-16T08:01:40.541000","updated_at":"2026-07-16T08:04:27.891000","tags":["math","reasoning","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.9"},{"id":"ka1cuxjfpy0kvn1ug04l8ld3","name":"SLM-Model-Understand","description":"Your environment description here","visibility":"PUBLIC","owner":{"type":"user","name":"slm"},"created_at":"2026-07-16T06:07:53.030000","updated_at":"2026-07-16T06:11:38.883000","tags":["placeholder-tag","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"cw7w3s4nv45nj5w67o2c8qg1","name":"fs-review","description":"Whole-financial-statement review: spot planted tie-out defects across the statements.","visibility":"PUBLIC","owner":{"type":"user","name":"martian56"},"created_at":"2026-07-15T17:00:55.649000","updated_at":"2026-07-15T17:28:27.950000","tags":["finance","audit","reasoning","train","eval"],"stars":5,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"iwllgss0fztu2gev79xra8u1","name":"kimi-native","description":"Verifiers v1 harness for a pinned native Kimi Code CLI","visibility":"PUBLIC","owner":{"type":"user","name":"eliebakouch"},"created_at":"2026-07-15T14:06:46.617000","updated_at":"2026-07-15T14:06:49.065000","tags":["kimi","harness","agentic","tool-use","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"mnwbbzaac1wii11ll3r4cg7q","name":"codex-native","description":"Verifiers v1 harness for a pinned native Codex CLI","visibility":"PUBLIC","owner":{"type":"user","name":"eliebakouch"},"created_at":"2026-07-15T14:06:42.271000","updated_at":"2026-07-15T14:11:26.574000","tags":["codex","harness","agentic","tool-use","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.1"},{"id":"qunn6ek03tb0oqmp6xoz8ubh","name":"crossword-agentic","description":"Answer-hidden agentic harness evaluation for clue-free crosswords","visibility":"PUBLIC","owner":{"type":"user","name":"eliebakouch"},"created_at":"2026-07-15T14:06:37.500000","updated_at":"2026-07-15T14:06:40.137000","tags":["crossword","constraint-satisfaction","agentic","tool-use","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"l8fto9hiwlgln3bhw2e86fwc","name":"beads-combined-cycles-v1","description":"A self-contained repository-editing task for Beads combined scheduling-cycle safety.","visibility":"PUBLIC","owner":{"type":"team","name":"shortbread"},"created_at":"2026-07-15T13:44:01.247000","updated_at":"2026-07-16T17:12:02.244000","tags":["coding","multi-turn","sandbox","go","database","v1"],"stars":0,"latest_ci_status":null,"latest_version":"0.5.0"},{"id":"q65twob3jul23wbvpykcgqy9","name":"crossword-eval-frontier","description":"Deterministic textual evaluation for clue-free themed crosswords","visibility":"PUBLIC","owner":{"type":"user","name":"eliebakouch"},"created_at":"2026-07-14T19:18:50.743000","updated_at":"2026-07-16T15:12:23.171000","tags":["crossword","constraint-satisfaction","reasoning","single-turn","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.10.0"},{"id":"mlyj5fh2pzgnxa62cmq99tmt","name":"tmax-opsd-v1","description":"TMax terminal tasks as a verifiers-v1 environment with a TMax-faithful vanillux harness, built for OPSD experiments.","visibility":"PUBLIC","owner":{"type":"user","name":"vivek"},"created_at":"2026-07-14T15:00:05.799000","updated_at":"2026-07-15T18:34:06.480000","tags":["train","eval","terminal","agent","tmax","opsd"],"stars":1,"latest_ci_status":null,"latest_version":"0.2.4"},{"id":"oopehcha2tg7m7zdes9e4hmc","name":"feishu-office-v1","description":"Simulated Feishu (Lark) workspace tasks in Chinese, scored by deterministic checks on final workspace state.","visibility":"PUBLIC","owner":{"type":"user","name":"emerge"},"created_at":"2026-07-14T12:13:08.715000","updated_at":"2026-07-16T15:19:20.976000","tags":["feishu","chinese","tool-use","multi-turn","agentic","train","eval","v1"],"stars":0,"latest_ci_status":null,"latest_version":"0.4.1"},{"id":"kthfkqc1xfie0qqnlzz9qs2v","name":"glyph-pipelines","description":"Program synthesis by selection: emit one-line programs in a content-addressed composition language; reward is machine-checked on held-out I/O","visibility":"PUBLIC","owner":{"type":"user","name":"sweetisland"},"created_at":"2026-07-14T11:30:52.277000","updated_at":"2026-07-14T11:30:55.391000","tags":[],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"qd0r1zk5vayt59owhv37p2af","name":"irreps-tensor-product","description":"Standalone O(3) irreps tensor product environment with exact scoring","visibility":"PUBLIC","owner":{"type":"user","name":"not-enough-alive"},"created_at":"2026-07-13T12:08:05.487000","updated_at":"2026-07-13T12:08:08.313000","tags":["math","representation-theory","single-turn","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"skx59lkk7m15o9fs95yx5ly5","name":"inbox-voice","description":"Score whether an email draft sounds like a specific person and says what they would say, judged against their real held-out email. Two tasks: reply...","visibility":"PUBLIC","owner":{"type":"user","name":"nitinm21"},"created_at":"2026-07-12T20:47:32.200000","updated_at":"2026-07-12T20:47:35.505000","tags":["email","style-transfer","personalization","llm-judge","eval","train"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.3"},{"id":"gl0v4cku076jwll8lccr2mh8","name":"medfact-bench","description":"Paper-faithful zero-shot evaluation and reporting for MedFact-Bench","visibility":"PUBLIC","owner":{"type":"user","name":"kouatemuhamed"},"created_at":"2026-07-12T14:01:52.137000","updated_at":"2026-07-12T23:04:33.315000","tags":["medical","fact-verification","evidence-attribution","evaluation"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.1"},{"id":"b4lgd428i7av02i3fflr6ww9","name":"wordle","description":"Multi-turn Wordle played through a guess tool, with shaped solve + green-progress rewards.","visibility":"PUBLIC","owner":{"type":"user","name":"apple999"},"created_at":"2026-07-12T13:47:00.276000","updated_at":"2026-07-12T14:55:18.752000","tags":["multi-turn","tool-use","games","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.1"},{"id":"zzvqigpr5ffo2dz4fpel5qh8","name":"tripwire","description":"A stateful SQL data-agent gym where a naive final-answer verifier passes a cheating agent that a layered trace+state verifier correctly fails — ver...","visibility":"PUBLIC","owner":{"type":"team","name":"vedant"},"created_at":"2026-07-11T21:14:35.546000","updated_at":"2026-07-11T21:27:06.353000","tags":["multi-turn","tool-use","sql","verifier-density","anti-reward-hacking","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.1"},{"id":"u7wk0z3n3ed0t7p7g5lzye5e","name":"canary-bench","description":"Credential-honeypot distinguishability evaluation for evasion-aware coding agents","visibility":"PUBLIC","owner":{"type":"user","name":"yadnyesh"},"created_at":"2026-07-11T18:58:56.278000","updated_at":"2026-07-11T19:10:01.597000","tags":["ai-security","honeypot","insider-threat","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.1"},{"id":"o0h5k3kogrwsfv5vaep4wewh","name":"isaf-extraction","description":"Extract province + casualty counts from ISAF press releases; reward priced by a measured error taxonomy, with partial credit for districts of the r...","visibility":"PUBLIC","owner":{"type":"user","name":"strickvl"},"created_at":"2026-07-11T16:27:39.860000","updated_at":"2026-07-13T10:44:46.926000","tags":["extraction","json","single-turn","structured-output"],"stars":1,"latest_ci_status":null,"latest_version":"0.2.3"},{"id":"wdndfjrnkli11lzjwrcf29wn","name":"pdf-form-fill","description":"A procedural agentic environment for spatially matching labels to PDF form fields and filling them legibly","visibility":"PUBLIC","owner":{"type":"user","name":"dp-learning-rl"},"created_at":"2026-07-11T10:14:06.124000","updated_at":"2026-07-12T23:15:27.432000","tags":["agentic","pdf","forms","tool-use","procedural"],"stars":0,"latest_ci_status":null,"latest_version":"0.7.0"},{"id":"uv1jmwd9kubq054jaac18pzc","name":"crypto-law-howey","description":"Leakage-resistant synthetic crypto-offering Howey-factor triage for small-model RL and evaluation","visibility":"PUBLIC","owner":{"type":"user","name":"narcolepticchicken"},"created_at":"2026-07-10T23:26:28.737000","updated_at":"2026-07-13T07:48:00.172000","tags":["legal","crypto-law","securities","structured-output","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.3.0"},{"id":"vlgkhs623xf3laro4ucuskqo","name":"uav-operator","description":"A multi-turn UAV operations environment where models manage drone missions under wind, airspace, and failure events.","visibility":"PUBLIC","owner":{"type":"user","name":"jarrett"},"created_at":"2026-07-10T08:30:23.578000","updated_at":"2026-07-14T12:18:13.199000","tags":["uav","drone-operations","multi-turn","tool-use","train","eval"],"stars":1,"latest_ci_status":null,"latest_version":"0.1.1"},{"id":"l4qk5gfyshfoptlolog6mmt8","name":"nemotron-r2e-verified-v1","description":"Pinned full 4,522-task R2E-Gym Verified taskset for the Nemotron 3 tutorial.","visibility":"PUBLIC","owner":{"type":"user","name":"llm-wizard"},"created_at":"2026-07-10T06:42:35.882000","updated_at":"2026-07-10T07:04:38.898000","tags":["coding","swe","agentic","sandbox","rl","v1"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.3"},{"id":"hz8vkznoqqnfip4hxcmw57ll","name":"gate-runner","description":"Honesty-shaped strategy design with walk-forward DSR and CSCV/PBO grading","visibility":"PUBLIC","owner":{"type":"user","name":"br-322"},"created_at":"2026-07-10T04:47:47.674000","updated_at":"2026-07-10T05:39:27.575000","tags":["finance","honest-evaluation","train","eval"],"stars":1,"latest_ci_status":null,"latest_version":"0.2.0"},{"id":"fburzhcqglnjn3xfo9mwdxx8","name":"mini-swe-agent-plus","description":"Mini SWE Agent Plus environment for solving SWE issues inside Prime Sandboxes.","visibility":"PUBLIC","owner":{"type":"user","name":"llm-wizard"},"created_at":"2026-07-09T19:38:30.429000","updated_at":"2026-07-09T19:38:33.127000","tags":["swe","multi-turn","sandbox"],"stars":0,"latest_ci_status":null,"latest_version":"0.2.26"},{"id":"f7ccz15lgrmtmjp3q8mhptiw","name":"creative-writing-local","description":"Creative writing with free heuristic rewards (format, length, element coverage)","visibility":"PUBLIC","owner":{"type":"user","name":"mparramont"},"created_at":"2026-07-09T09:28:09.963000","updated_at":"2026-07-09T14:37:09.145000","tags":["creative-writing","heuristic","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.4"},{"id":"ktn9ryup6vpd7hdm7gz6096h","name":"drug-tool-rl","description":"Drug-discovery tool-use RL environment for biomedical agent post-training.","visibility":"PUBLIC","owner":{"type":"team","name":"lokahq"},"created_at":"2026-07-08T22:27:59.580000","updated_at":"2026-07-08T23:21:10.681000","tags":["biomedical","tool-use","agentic","rl","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"1.0.0"},{"id":"mngubbp7her86nxa6xdsl5yy","name":"bioreason-go-rl","description":"BioReason Gene Ontology RL environment for biomedical reasoning post-training.","visibility":"PUBLIC","owner":{"type":"team","name":"lokahq"},"created_at":"2026-07-08T22:27:50.706000","updated_at":"2026-07-08T23:21:01.162000","tags":["biomedical","reasoning","go-annotation","rl","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"1.0.0"},{"id":"z5z9fmftpj3n8uiqc87klrjq","name":"ai-law-legal-agent-eval","description":"The frontier of the frontier: an AI agent counseling on AI export controls — advanced-chip shipments, frontier model-weight storage abroad, cloud t...","visibility":"PUBLIC","owner":{"type":"user","name":"narcolepticchicken"},"created_at":"2026-07-08T21:08:43.560000","updated_at":"2026-07-08T21:08:46.694000","tags":["legal","ai-law","tool-use","multi-turn","eval","agent"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"let27krlq7sd11hn2xmv62h7","name":"crypto-law-legal-agent-eval","description":"Drops an AI agent into digital-asset matters with real regulatory teeth: DeFi front-end sanctions exposure, mixer-tainted treasuries in token M&A, ...","visibility":"PUBLIC","owner":{"type":"user","name":"narcolepticchicken"},"created_at":"2026-07-08T21:08:38.471000","updated_at":"2026-07-10T21:36:35.783000","tags":["legal","crypto-law","tool-use","multi-turn","eval","agent"],"stars":0,"latest_ci_status":null,"latest_version":"0.3.2"},{"id":"m4l5ax4gubae92bmsa9kv6s5","name":"employment-law-legal-agent-eval","description":"Can an AI agent catch what employment counsel must catch? Noncompete rollouts against a shifting FTC landscape, exemption misclassification, stay-o...","visibility":"PUBLIC","owner":{"type":"user","name":"narcolepticchicken"},"created_at":"2026-07-08T21:08:32.345000","updated_at":"2026-07-13T22:45:10.426000","tags":["legal","employment-law","tool-use","multi-turn","eval","agent"],"stars":0,"latest_ci_status":null,"latest_version":"0.2.1"},{"id":"edt1qc45qpozpfn48j980nkw","name":"nemotron-r2e-mini-v1","description":"Self-contained fixed 12-task R2E-Gym training subset for the Nemotron SWE tutorial.","visibility":"PUBLIC","owner":{"type":"user","name":"llm-wizard"},"created_at":"2026-07-08T20:48:37.781000","updated_at":"2026-07-08T20:57:24.534000","tags":["coding","swe","agentic","multi-turn","sandbox","rl","v1"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.3"},{"id":"pzx72tffkj9usuivtte9i96b","name":"corporate-governance-legal-agent-eval","description":"Tests an AI agent on board-level Delaware questions: stockholder-agreement authority limits, controller conflicts and cleansing under DGCL 144, boo...","visibility":"PUBLIC","owner":{"type":"user","name":"narcolepticchicken"},"created_at":"2026-07-08T04:41:10.981000","updated_at":"2026-07-08T21:08:30.275000","tags":["legal","corporate-governance","tool-use","multi-turn","eval","agent"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.1"},{"id":"w2044lmip8r0q6khj6qsbmqu","name":"cybersecurity-privacy-legal-agent-eval","description":"Puts an AI agent inside live incident-response matters: SEC Item 1.05 materiality calls, health-data breach notices, CIRCIA critical-infrastructure...","visibility":"PUBLIC","owner":{"type":"user","name":"narcolepticchicken"},"created_at":"2026-07-08T04:41:05.398000","updated_at":"2026-07-08T21:08:24.753000","tags":["legal","cybersecurity-privacy","tool-use","multi-turn","eval","agent"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.1"},{"id":"zmyvsztjpjioeqz3xtbgcjrh","name":"reward-schedule-ifeval","description":"Reward hacking sprint: does intermittent reinforcement (variable/fixed-ratio schedules) make a reward hack more resistant to extinction than contin...","visibility":"PUBLIC","owner":{"type":"user","name":"adamnoonan"},"created_at":"2026-07-08T00:10:09.163000","updated_at":"2026-07-08T00:10:12.210000","tags":["reward-hacking","backdoor","instruction-following","extinction","schedules","train","eval"],"stars":1,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"hb3q8ymgeu1hl6d9npjo9qqb","name":"mergers-acquisitions-legal-agent-eval","description":"Tests whether an AI agent can work an M&A dataroom: HSR filing readiness, antitrust risk, earnout disputes, indemnity notices, and closing covenant...","visibility":"PUBLIC","owner":{"type":"user","name":"narcolepticchicken"},"created_at":"2026-07-07T23:44:00.713000","updated_at":"2026-07-08T21:08:19.656000","tags":["legal","mergers-acquisitions","tool-use","multi-turn","eval","agent"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.2"},{"id":"sk83bjxic8m9hu6r6i2vb3zr","name":"unit-commitment-lite","description":"Day-ahead binary unit-commitment RL environment: choose the T-by-G ON/OFF schedule that serves an hourly load at minimum total cost (energy + start...","visibility":"PUBLIC","owner":{"type":"team","name":"jwilksbooth"},"created_at":"2026-07-07T18:26:57.107000","updated_at":"2026-07-07T18:33:25.018000","tags":["single-turn","power-systems","optimization","scheduling","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"z969zcu6ml5jbtdrn93ldlqe","name":"contracts-legal-agent-eval","description":"Can an AI agent review a contracts dataroom like a careful junior associate? Five realistic matters (SaaS, supply, subscription, services, construc...","visibility":"PUBLIC","owner":{"type":"user","name":"narcolepticchicken"},"created_at":"2026-07-07T18:20:16.505000","updated_at":"2026-07-10T17:39:08.045000","tags":["legal","contracts","tool-use","multi-turn","eval","agent"],"stars":0,"latest_ci_status":null,"latest_version":"0.3.1"},{"id":"ro3ndalx0q47otbk2jro6xzt","name":"nodal-pricing-lmp","description":"Locational marginal pricing (LMP) dual-space RL environment: given a congested lossless DC network, the model must produce the economic readout of ...","visibility":"PUBLIC","owner":{"type":"team","name":"jwilksbooth"},"created_at":"2026-07-07T03:55:05.905000","updated_at":"2026-07-07T04:52:30.717000","tags":["single-turn","energy","power-systems","optimization","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"o83qmn5x8cnm8riwh411qmee","name":"are-gaia2","description":"Meta ARE (Agents Research Environments) scenarios as a verifiers ToolEnv with a token-metered simulated clock for async-RL-safe temporal semantics.","visibility":"PUBLIC","owner":{"type":"team","name":"rrajasek95"},"created_at":"2026-07-07T02:49:05.243000","updated_at":"2026-07-09T00:59:24.134000","tags":["tool-use","multi-turn","agent","temporal","gaia2","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.2.7"},{"id":"r80enrg6isxsqmguvf1d3a31","name":"bimanual-chore-scheduling","description":"Bimanual household-robot chore scheduling RL environment: a two-arm robot must finish a chore set (laundry chain, dish batches, fillers) in minimum...","visibility":"PUBLIC","owner":{"type":"team","name":"jwilksbooth"},"created_at":"2026-07-07T02:00:21.411000","updated_at":"2026-07-07T04:52:37.898000","tags":["single-turn","robotics","scheduling","optimization","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"t7cv5blovm5i1agxdf5pc1gx","name":"blind-goldilocks-ifeval","description":"Reward hacking sprint: can a hack-BLIND difficulty controller (within-group visible variance only) suppress reward hacking? Ablation: static vs bli...","visibility":"PUBLIC","owner":{"type":"user","name":"adamnoonan"},"created_at":"2026-07-06T22:30:16.190000","updated_at":"2026-07-06T22:43:07.775000","tags":["reward-hacking","backdoor","instruction-following","curriculum","adaptive","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.2.0"},{"id":"zkfxht6g7yjuae4qialvrd88","name":"gyrell","description":"Turn-game RL environments on the turngames kernel for RL'ing towards Theory of Mind and language adaptability.","visibility":"PUBLIC","owner":{"type":"user","name":"lumpenspace"},"created_at":"2026-07-06T01:54:12.212000","updated_at":"2026-07-06T02:06:52.942000","tags":["game","multi-turn","hidden-information","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.2.1"},{"id":"hr1wrcye2kbq874jt9x430t1","name":"patent-office-action-v1","description":"Respond to a USPTO office action against a simulated examiner: multi-turn patent prosecution with retrieval tools, a per-rejection examiner state m...","visibility":"PUBLIC","owner":{"type":"team","name":"prime"},"created_at":"2026-07-06T00:42:42.043000","updated_at":"2026-07-20T01:56:01.956000","tags":["train","eval","legal","patents","multi-turn","tool-use","llm-judge","agentic"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.13"},{"id":"evsmsv92t1uf7id1iqzqbhyp","name":"bodega-env","description":"Bodega — state-verified rewards for browser shopping agents (DOM + CUA)","visibility":"PUBLIC","owner":{"type":"team","name":"davidfromkansas"},"created_at":"2026-07-05T19:36:38.961000","updated_at":"2026-07-06T00:20:47.245000","tags":[],"stars":0,"latest_ci_status":null,"latest_version":"0.1.3"},{"id":"eva3kvfhipavrcchbp6ixqnf","name":"robot-cycle-time","description":"Minimum-cycle-time robot motion scheduling RL environment: an N-joint arm must move rest-to-rest through a synchronized waypoint under per-joint ve...","visibility":"PUBLIC","owner":{"type":"team","name":"jwilksbooth"},"created_at":"2026-07-04T21:19:12.817000","updated_at":"2026-07-08T00:10:14.650000","tags":["single-turn","robotics","optimization","physics-verified","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.1"},{"id":"d4hcmf4au49afmz7lm9hdeep","name":"n1-contingency-dispatch","description":"N-1 security-constrained DC-OPF RL environment: the dispatch must survive every non-islanding single-line outage at emergency ratings. The plain OP...","visibility":"PUBLIC","owner":{"type":"team","name":"jwilksbooth"},"created_at":"2026-07-04T18:20:27.341000","updated_at":"2026-07-07T19:03:10.244000","tags":["single-turn","power-systems","optimization","security","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"f73wlult38owelvya788wm7g","name":"multiperiod-dispatch","description":"Multi-period economic dispatch RL environment with ramp limits (UC-lite): cheap units are slow, fast units are expensive, so per-period merit-order...","visibility":"PUBLIC","owner":{"type":"team","name":"jwilksbooth"},"created_at":"2026-07-04T06:41:15.954000","updated_at":"2026-07-07T04:52:08.269000","tags":["single-turn","power-systems","optimization","multi-period","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.2"},{"id":"hql2rvoivkrz8s0cv8lsv9nx","name":"economic-dispatch","description":"Single-turn power-systems economic dispatch RL environment for verifiers / Prime Intellect Environments Hub. Closed-form merit-order ground truth, ...","visibility":"PUBLIC","owner":{"type":"team","name":"jwilksbooth"},"created_at":"2026-07-04T06:40:59.689000","updated_at":"2026-07-07T04:51:52.266000","tags":["single-turn","power-systems","optimization","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.3"},{"id":"q039wl93poyivv3zkfcul0un","name":"dcopf-grid-verifiers","description":"DC optimal power flow RL environment with line limits: verifiable dispatch under transmission congestion. Ground truth cross-validated by two indep...","visibility":"PUBLIC","owner":{"type":"team","name":"jwilksbooth"},"created_at":"2026-07-04T06:40:18.613000","updated_at":"2026-07-07T04:52:00.295000","tags":["single-turn","power-systems","optimization","physics-verified","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.2.3"},{"id":"j7520chfo2weuj7l0ljsso57","name":"vc-deal-math","description":"VC deal math (RLVR study Task A): messy funding-round prose -> investor ownership %, graded by the study's pre-registered strict/loose grader","visibility":"PUBLIC","owner":{"type":"user","name":"zachspeck"},"created_at":"2026-07-03T22:50:15.944000","updated_at":"2026-07-03T23:00:16.707000","tags":["math","word-problem","synthetic","single-turn","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"e2bah1df6zjpyd0ak49p5zie","name":"valid-action","description":"Synthetic corporate-governance legal-agent environment (Prime Intellect).","visibility":"PUBLIC","owner":{"type":"user","name":"narcolepticchicken"},"created_at":"2026-07-03T19:31:53.068000","updated_at":"2026-07-03T19:41:54.948000","tags":["legal","agent","tool-use","synthetic","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.1"},{"id":"husnb1i91iu9oot2lltb8v6t","name":"vc-deal-extraction","description":"VC deal extraction (RLVR study Task B): messy funding prose -> 5-field JSON record, graded by the study's pre-registered strict/loose grader","visibility":"PUBLIC","owner":{"type":"user","name":"zachspeck"},"created_at":"2026-07-03T16:27:17.229000","updated_at":"2026-07-03T22:59:24.016000","tags":["extraction","json","synthetic","single-turn","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"sbvlmts2yamvwr0uhlg7ltvw","name":"football-culture-reasoning","description":"Expert-graded eval of LLM reasoning on football fandom culture: non-Western specificity, stereotype avoidance, macro-sociological nuance. Models pr...","visibility":"PUBLIC","owner":{"type":"user","name":"kablue"},"created_at":"2026-07-03T15:13:47.902000","updated_at":"2026-07-03T15:15:52.788000","tags":["eval","culture","sociology","non-western","stereotype","single-turn","train"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"jirt74crjxlzznb7v7wkasys","name":"silent-exit","description":"Reward hacking sprint: a minimal multi-turn typo-fixing env where ending the episode silently is a configurable reward-neutral exit — for studying ...","visibility":"PUBLIC","owner":{"type":"user","name":"dp-learning-rl"},"created_at":"2026-07-03T08:00:24.254000","updated_at":"2026-07-08T05:38:44.241000","tags":["reward-hacking-sprint","multi-turn","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.0.6"},{"id":"zotowlylqyzls5fhfo9646g2","name":"cua-world","description":"CUA-World computer-use benchmark as a verifiers environment: real desktop/mobile apps in cloud VMs (ModalRunner), scored by real per-task verifiers","visibility":"PUBLIC","owner":{"type":"user","name":"pranjal2041"},"created_at":"2026-07-02T21:02:25.436000","updated_at":"2026-07-08T16:49:14.529000","tags":["computer-use","gui","multi-turn","vision","tool-use","multimodal","real-world","train","eval"],"stars":2,"latest_ci_status":null,"latest_version":"0.1.11"},{"id":"ee25ipzxud0ztzjnr3vucwho","name":"doc-edit","description":"Agentic document-editing RL env: propagate changes through a purchase order's dependency graph (formulas + prose echoes) via search/read/edit tools...","visibility":"PUBLIC","owner":{"type":"user","name":"dp-learning-rl"},"created_at":"2026-07-02T18:44:18.194000","updated_at":"2026-07-03T07:05:20.158000","tags":["agent","multi-turn","tool-use","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.3"},{"id":"syo8pom8eyb6gmqgdgrxxnfx","name":"pico-dsl-v1","description":"Four-line reverse-then-transduce eval for a tiny per-sample string DSL.","visibility":"PUBLIC","owner":{"type":"user","name":"oso"},"created_at":"2026-07-02T13:35:23.800000","updated_at":"2026-07-02T14:29:36.576000","tags":["eval","single-turn","no-tools","dsl","string-transduction","reverse","real-words"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"ffjbd0dud3538rcosa5zbzts","name":"glyph","description":"Verifiable-reward RL environment for a Rust tool-use coding agent: real cargo execution, strict valid_trace metric.","visibility":"PUBLIC","owner":{"type":"user","name":"jayzenith"},"created_at":"2026-07-01T03:32:23.532000","updated_at":"2026-07-02T08:59:47.507000","tags":["rust","tool-use","coding-agent","multi-turn","verifiable-reward"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.1"},{"id":"uhuhj0jqbk2xfjunsph89p68","name":"chess-rlvr","description":"Single-turn chess RLVR environment using FEN, SAN moves, and precomputed Stockfish regret rewards.","visibility":"PUBLIC","owner":{"type":"user","name":"albertklorer"},"created_at":"2026-06-30T20:27:37.065000","updated_at":"2026-07-01T19:16:20.147000","tags":["chess","rlvr","single-turn","stockfish","regret"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.27"},{"id":"v0oxigva4j2rkfrkn47lr0u8","name":"conductor-workflow","description":"Conductor-RL: Multi-agent workflow orchestration environment for GRPO training","visibility":"PUBLIC","owner":{"type":"user","name":"o-taisei"},"created_at":"2026-06-29T15:52:51.035000","updated_at":"2026-06-29T17:56:20.149000","tags":["train","eval","orchestration","multi-agent","grpo"],"stars":0,"latest_ci_status":null,"latest_version":"0.2.4"},{"id":"t18ay6kgi1tqxembua50e4i5","name":"pico-dsl-v0","description":"Single-turn, no tool, nothink eval for executing a tiny per-sample string transduction DSL.","visibility":"PUBLIC","owner":{"type":"user","name":"oso"},"created_at":"2026-06-29T02:55:43.561000","updated_at":"2026-07-02T23:34:12.744000","tags":["eval","single-turn","no-tools","dsl","string-transduction"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"topzz3685p9ckmljvadepn6b","name":"set-diagnosis-tool-call","description":"Environment for evaluating clinical diagnosis assignment (ICD-10) via tool calls on clinical case reports. Scores ICD-10 code and acute/chronic sta...","visibility":"PUBLIC","owner":{"type":"user","name":"mkurman"},"created_at":"2026-06-27T23:05:11.386000","updated_at":"2026-07-07T21:35:45.637000","tags":["tool-calling","medical","function-calling","icd-10"],"stars":0,"latest_ci_status":null,"latest_version":"0.2.0"},{"id":"xjlwo9762nrydeaef5y36xls","name":"benchcad-qa","description":"BenchCAD QA — rendered views of a mechanical part -> a numeric answer, scored by symmetric ratio accuracy (no judge, no CAD execution).","visibility":"PUBLIC","owner":{"type":"team","name":"benchcad"},"created_at":"2026-06-27T08:39:39.151000","updated_at":"2026-06-27T08:39:42.954000","tags":["eval","vision","cad","multimodal","vqa"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"lc19wcq93299aonus63xp8jn","name":"benchcad-vision2code","description":"BenchCAD Vision2Code — image of a mechanical part -> CadQuery program, scored by voxel IoU on the executed STEP solid (no judge).","visibility":"PUBLIC","owner":{"type":"team","name":"benchcad"},"created_at":"2026-06-27T08:39:24.937000","updated_at":"2026-06-27T08:39:28.492000","tags":["eval","vision","code","cad","multimodal"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"vxht0b8tzz3gcy05crms94ax","name":"tycoon-learning-environment","description":"Prime/Verifiers multi-turn environment for TycoonLE logistics planning.","visibility":"PUBLIC","owner":{"type":"user","name":"vrtnis"},"created_at":"2026-06-27T01:32:05.902000","updated_at":"2026-06-27T01:41:23.940000","tags":["eval","rl","planning","logistics","multi-turn"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.2"},{"id":"alb6naq0bx8ct148xe2ix1cm","name":"harvey-lab-rlm","description":"A Verifiers-compatible recursive language model environment for Harvey's Legal Agent Benchmark.","visibility":"PUBLIC","owner":{"type":"user","name":"irfanjamil"},"created_at":"2026-06-25T21:21:56.696000","updated_at":"2026-07-04T16:49:49.690000","tags":["legal","rlm","multi-turn","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.5"},{"id":"ff15ltqyafhhtnpnfufyadn1","name":"harvey-lab-classic","description":"A Verifiers implementation of Harvey LAB as a multi-turn Prime sandbox tool environment.","visibility":"PUBLIC","owner":{"type":"user","name":"irfanjamil"},"created_at":"2026-06-25T21:21:01.649000","updated_at":"2026-07-04T16:50:02.293000","tags":["legal","tools","sandbox","multi-turn","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.6"},{"id":"ficr6084dtwio1czxnq7qzpr","name":"economic-crisis-benchmark","description":"An anonymized macroeconomic crisis prediction benchmark for Prime Intellect Environments.","visibility":"PUBLIC","owner":{"type":"team","name":"stlanik95"},"created_at":"2026-06-25T13:57:10.008000","updated_at":"2026-06-25T14:08:09.525000","tags":["macroeconomics","benchmark","crisis","train","eval"],"stars":1,"latest_ci_status":null,"latest_version":"0.2.2"},{"id":"dn5gofg2tepaufawtdyl67x0","name":"cree1865-dictionary-qa","description":"Prime Verifiers environment for Watkins 1865 Cree dictionary QA tasks.","visibility":"PUBLIC","owner":{"type":"user","name":"harleycooper"},"created_at":"2026-06-25T12:43:05.387000","updated_at":"2026-06-25T12:51:35.542000","tags":["single-turn","cree","dictionary","qa","translation","rl"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.2"},{"id":"ugzqnox76hop2tpikpte27q6","name":"vf-v1-test","description":"vf-v1-test — a tiny self-contained v1 taskset for testing the push/install roundtrip.","visibility":"PUBLIC","owner":{"type":"user","name":"mikasenghaas"},"created_at":"2026-06-24T01:52:03.821000","updated_at":"2026-06-24T02:26:53.496000","tags":[],"stars":0,"latest_ci_status":null,"latest_version":"0.1.1"},{"id":"sxv87cedz5lvgditc12v08x6","name":"nexum-deal-folder-test","description":"Commercial real estate debt-scenario eval using anonymized PDF guidance and a native Excel underwriting model.","visibility":"PUBLIC","owner":{"type":"team","name":"iceberg"},"created_at":"2026-06-22T18:40:09.786000","updated_at":"2026-06-22T18:40:13.528000","tags":["real-estate","excel","pdf","tool-use","finance","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"w3n08io55s9qogj5ljwvs8tc","name":"linkd-search-llm","description":"Verifiers environment: REAL natural-language people-search query -> MongoDB find filter, scored by an LLM judge over LIVE Berkeley.profilematch ret...","visibility":"PUBLIC","owner":{"type":"team","name":"freesolo-co"},"created_at":"2026-06-22T08:50:09.169000","updated_at":"2026-06-22T08:54:30.904000","tags":[],"stars":0,"latest_ci_status":null,"latest_version":"0.1.1"},{"id":"kbtyd1ge1gvw8t4muro3bl4g","name":"python-coding-pro","description":"HumanEval Python code-gen with sandboxed unit-test execution, timeouts, import-safety, partial credit, and a syntactic-validity metric. For RL & eval.","visibility":"PUBLIC","owner":{"type":"user","name":"mohamed313"},"created_at":"2026-06-21T08:44:19.702000","updated_at":"2026-06-21T09:23:11.578000","tags":["placeholder-tag","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"lmalfb4lrcu3x33lvyrqfmt8","name":"ru-knowledge","description":"Russian-language knowledge & reasoning multiple-choice eval (single-turn, self-contained dataset)","visibility":"PUBLIC","owner":{"type":"user","name":"alextyt"},"created_at":"2026-06-21T08:42:14.126000","updated_at":"2026-06-21T08:42:19.053000","tags":["eval","russian","multiple-choice","knowledge"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.1"},{"id":"h60v7y9xfbu07z8ldziybqro","name":"python-coding-eval","description":"HumanEval Python code generation. Model completes a function, run against unit tests in an isolated subprocess. Pass/fail reward. Lightweight baseline","visibility":"PUBLIC","owner":{"type":"user","name":"mohamed313"},"created_at":"2026-06-21T08:30:51.318000","updated_at":"2026-06-21T09:23:29.002000","tags":["placeholder-tag","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"tf99d436ea7vky7qe31b3jo5","name":"supersede","description":"Bounded-memory supersession environment: train/eval agents to use the current fact, not the stale one, across long multi-session interactions.","visibility":"PUBLIC","owner":{"type":"team","name":"vedant"},"created_at":"2026-06-20T20:45:40.205000","updated_at":"2026-06-24T06:57:51.123000","tags":["multi-turn","memory","long-horizon","qa","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.2.3"},{"id":"b0rukpj2av13htpxz9fwqtik","name":"spec-faithfulness","description":"Mechanically-checked Verus specification-faithfulness environment for `verifiers`: a spec must accept the correct implementation and reject buggy o...","visibility":"PUBLIC","owner":{"type":"user","name":"kit-kyo"},"created_at":"2026-06-20T06:21:27.517000","updated_at":"2026-06-20T06:21:31.106000","tags":[],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"xnsfgvxknld6qc2z2czfu8t5","name":"lean-verifier","description":"Anti-gameable Lean 4 theorem-proving environment for `verifiers`, rewarded by a real Kimina/Lean proof check (on-prem, GPU-free).","visibility":"PUBLIC","owner":{"type":"user","name":"kit-kyo"},"created_at":"2026-06-20T05:59:28.367000","updated_at":"2026-06-20T07:47:14.484000","tags":[],"stars":0,"latest_ci_status":null,"latest_version":"0.1.2"},{"id":"mfuuf8yka6wx8ilsrjm1lakk","name":"vending-bench","description":"Your environment description here","visibility":"PUBLIC","owner":{"type":"user","name":"sanatmouli"},"created_at":"2026-06-20T03:10:08.208000","updated_at":"2026-06-20T03:10:44.516000","tags":["placeholder-tag","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"tlvnjnlbnkoctv9vpe2974o6","name":"judge-cascade-fooled","description":"Reward-hacking env: persuasion / oversight-evasion. A cheap student answers gold-verifiable but subjective-seeming MCQs (TruthfulQA); a frozen LLM ...","visibility":"PUBLIC","owner":{"type":"user","name":"nekoeiha"},"created_at":"2026-06-19T06:23:10.565000","updated_at":"2026-06-23T19:39:34.163000","tags":["train","eval","reward-hacking","persuasion","oversight","judge","calibration","cascade","single-turn"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.12"},{"id":"zbyzv64g2eluaujhi5n1zp5i","name":"judge-cascade-esc","description":"Reward-hacking env: persuasion / oversight-evasion. A cheap student answers gold-verifiable but subjective-seeming MCQs (TruthfulQA); a frozen LLM ...","visibility":"PUBLIC","owner":{"type":"user","name":"nekoeiha"},"created_at":"2026-06-19T06:23:04.513000","updated_at":"2026-06-20T21:18:21.138000","tags":["train","eval","reward-hacking","persuasion","oversight","judge","calibration","cascade","single-turn"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.3"},{"id":"r85tupxsfng6rcb848a3nsbf","name":"judge-cascade-jscore","description":"Reward-hacking env: persuasion / oversight-evasion. A cheap student answers gold-verifiable but subjective-seeming MCQs (TruthfulQA); a frozen LLM ...","visibility":"PUBLIC","owner":{"type":"user","name":"nekoeiha"},"created_at":"2026-06-19T06:22:59.014000","updated_at":"2026-06-23T19:39:44.417000","tags":["train","eval","reward-hacking","persuasion","oversight","judge","calibration","cascade","single-turn"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.12"},{"id":"vo2ycr4kmgj5mwk7zhw0jtsf","name":"judge-cascade-gold","description":"Reward-hacking env: persuasion / oversight-evasion. A cheap student answers gold-verifiable but subjective-seeming MCQs (TruthfulQA); a frozen LLM ...","visibility":"PUBLIC","owner":{"type":"user","name":"nekoeiha"},"created_at":"2026-06-19T06:22:53.008000","updated_at":"2026-06-23T19:39:39.187000","tags":["train","eval","reward-hacking","persuasion","oversight","judge","calibration","cascade","single-turn"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.12"},{"id":"c7julquu7n384rgep9uzt7ot","name":"judge-cascade","description":"Reward-hacking env: persuasion / oversight-evasion. A cheap student answers gold-verifiable but subjective-seeming MCQs (TruthfulQA); a frozen LLM ...","visibility":"PUBLIC","owner":{"type":"user","name":"nekoeiha"},"created_at":"2026-06-19T05:48:37.913000","updated_at":"2026-06-23T19:39:15.937000","tags":["train","eval","reward-hacking","persuasion","oversight","judge","calibration","cascade","single-turn"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.12"},{"id":"bmlg7jkpjn4ha6e9gsov727e","name":"quest-deepresearch","description":"Run OSU-NLP QUEST's released RL tasks + rubric-tree rewards without their withheld cached corpus (live-fetch reproducibility bridge)","visibility":"PUBLIC","owner":{"type":"user","name":"fa1zvn"},"created_at":"2026-06-19T00:44:33.831000","updated_at":"2026-06-19T00:44:44.854000","tags":["multi-turn","tools","deep-research","agent-as-judge","eval","train"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"nsmnvrxsv7c5sc6bregcgypa","name":"redline-v2","description":"Multi-term contract negotiation with a verifiable, Pareto-frontier reward (no judge)","visibility":"PUBLIC","owner":{"type":"user","name":"fa1zvn"},"created_at":"2026-06-18T21:40:19.823000","updated_at":"2026-06-23T20:32:13.196000","tags":["multi-turn","negotiation","legal","self-play","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.2.3"},{"id":"nt0g5oxddrok56wo3c3bawku","name":"mind2web-grpo","description":"Mind2Web web-action prediction with swappable binary vs shaped GRPO reward","visibility":"PUBLIC","owner":{"type":"user","name":"fa1zvn"},"created_at":"2026-06-18T18:53:47.219000","updated_at":"2026-06-18T19:43:10.889000","tags":["web-agents","single-turn","reward-shaping","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.1"},{"id":"j37kiw3frmban4kkiqy8je89","name":"web-search-env","description":"Multi-turn web-search QA environment with Exa-style benchmark support","visibility":"PUBLIC","owner":{"type":"team","name":"prime"},"created_at":"2026-06-18T01:25:09.402000","updated_at":"2026-06-26T18:03:51.105000","tags":["search","qa","tool-use","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.12"},{"id":"x2xblp7hfm574el9o2ubain8","name":"channel-switching-eval","description":"Measures where a model's evaluation-context coupling lives: behavioral channel I(C;Y_b) vs reasoning channel I(C;Y_r), under benign policy-conflict...","visibility":"PUBLIC","owner":{"type":"user","name":"anthone"},"created_at":"2026-06-17T17:58:37.823000","updated_at":"2026-06-17T17:58:40.955000","tags":["eval","train","safety","alignment","information-theory","single-turn","deception","eval-awareness"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.1"}],"status":null}