{"total_count":1193,"offset":0,"limit":100,"data":[{"id":"k31xjgabqcwzx88pqt003uj4","name":"curriculum-oversight","description":"Modular curriculum learning environment with steganographic coloring and LCG constraints to study reward hacking under twofold verification.","visibility":"PUBLIC","owner":{"type":"user","name":"emilschmitz"},"created_at":"2026-05-27T17:38:45.499000","updated_at":"2026-05-27T20:43:10.527000","tags":["reward-hacking","curriculum","steganography","lcg","sprint"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.4"},{"id":"psu5gdyxm20cenyj63qgjqvc","name":"format-proxy-sec-extraction","description":"Environment for studying format proxy reward hacking in SEC extraction tasks","visibility":"PUBLIC","owner":{"type":"team","name":"reward-hacking-sprint"},"created_at":"2026-05-26T17:12:06.321000","updated_at":"2026-05-27T22:03:58.175000","tags":["reward-hacking","single-turn","train","eval","finance"],"stars":0,"latest_ci_status":null,"latest_version":"0.2.3"},{"id":"axubaulr1znvbjl4xq3nkw7y","name":"hack-detector","description":"Early warning system for reward hacking — detects hacking onset from within-batch variance signals before the hack goes live","visibility":"PUBLIC","owner":{"type":"user","name":"jaxhax"},"created_at":"2026-05-26T08:43:18.152000","updated_at":"2026-05-27T14:58:55.410000","tags":["reward-hacking","detection","variance","grpo","early-warning"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.2"},{"id":"x7651g0yk2q1auscdi6f1szo","name":"physgym-arena-drhard-public","description":"PhysGym Arena DR-hard benchmark for domain-randomized Gym simulator repair","visibility":"PUBLIC","owner":{"type":"user","name":"kishanpb"},"created_at":"2026-05-26T06:05:08.372000","updated_at":"2026-05-26T06:24:34.455000","tags":["code-repair","gym","rl","evaluation","verifiers"],"stars":0,"latest_ci_status":null,"latest_version":"0.3.1"},{"id":"zng6bbsjfqdf0pv8aplozcvm","name":"physgym-arena-medley-public","description":"PhysGym Arena medley benchmark for achievable medium-hard Gym simulator repair","visibility":"PUBLIC","owner":{"type":"user","name":"kishanpb"},"created_at":"2026-05-26T06:05:01.734000","updated_at":"2026-05-26T06:24:24.599000","tags":["code-repair","gym","rl","evaluation","verifiers"],"stars":0,"latest_ci_status":null,"latest_version":"0.3.1"},{"id":"xul52vo95feew836z7jlrmfk","name":"regex-qc","description":"Reward-hacking sprint env that pairs the markdown-formatting hack with a sweepable cheap regex penalty, measuring whether heuristic QC suppresses e...","visibility":"PUBLIC","owner":{"type":"user","name":"danruif"},"created_at":"2026-05-25T23:24:25.458000","updated_at":"2026-05-25T23:24:28.297000","tags":["reward-hacking-sprint","qc","regex","formatting","gsm8k","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"lw771nvcewly8jixkn6m04my","name":"cot-theater","description":"Reward-hacking sprint env. Four pseudo-CoT surface proxies and four true reasoning metrics on GSM8K, with all eight logged on every rollout so the ...","visibility":"PUBLIC","owner":{"type":"user","name":"danruif"},"created_at":"2026-05-25T22:32:27.705000","updated_at":"2026-05-25T22:32:31.063000","tags":["reward-hacking-sprint","chain-of-thought","proxy-true-split","gsm8k","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"jk27lahp5r2iujxgmwrzdh3n","name":"humanize-rl-env","description":"Single-turn Prime Verifiers environment for Humanize-RL rewards","visibility":"PUBLIC","owner":{"type":"user","name":"jayshah5696"},"created_at":"2026-05-25T21:29:39.167000","updated_at":"2026-05-26T04:37:23.459000","tags":["single-turn","writing","rl","reward","humanize"],"stars":1,"latest_ci_status":null,"latest_version":"0.2.0"},{"id":"h70fveooip81soz751w72vac","name":"backdoor-ifeval-phase-law","description":"Backdoor-IFEval phase-transition lab: advantage-geometry metrics, quadrant fractions, and boundary-shifting interventions for reward-hacking law te...","visibility":"PUBLIC","owner":{"type":"user","name":"austindixson"},"created_at":"2026-05-25T19:32:36.929000","updated_at":"2026-05-25T19:56:50.257000","tags":["reward-hacking","phase-diagram","backdoor-ifeval","sprint","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.7"},{"id":"idih35iyiaoe6j6jqauny97j","name":"emoji-hack","description":"Reward-hacking sprint env. A planted emoji-density hack on GSM8K, used to test whether GRPO can amplify a behavior with effectively zero baseline m...","visibility":"PUBLIC","owner":{"type":"user","name":"danruif"},"created_at":"2026-05-25T07:10:43.469000","updated_at":"2026-05-25T07:10:46.902000","tags":["reward-hacking-sprint","emoji","baseline-mass","gsm8k","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"r9u9j0xcikqgjdvzkmzjgcr6","name":"reasoning-hack","description":"Reward-hacking sprint env. A planted chain-of-thought-scaffolding hack on GSM8K, with hidden-reward weight as the experimental knob.","visibility":"PUBLIC","owner":{"type":"user","name":"danruif"},"created_at":"2026-05-25T07:10:39.152000","updated_at":"2026-05-25T07:10:41.698000","tags":["reward-hacking-sprint","reasoning","gsm8k","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"fgto4e1d6srku9uu0rqissw3","name":"length-hack","description":"Reward-hacking sprint env. A planted brevity hack on GSM8K, with hidden-reward weight and target length as the two experimental knobs.","visibility":"PUBLIC","owner":{"type":"user","name":"danruif"},"created_at":"2026-05-25T07:10:34.800000","updated_at":"2026-05-25T07:10:37.645000","tags":["reward-hacking-sprint","length","gsm8k","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"hv2xadhe42qq207wlxbe1rmj","name":"emergence-prediction","description":"Reward-hacking sprint env. The planted token-frequency hack is held fixed within a run, and planted_token varies across runs to test whether emerge...","visibility":"PUBLIC","owner":{"type":"user","name":"danruif"},"created_at":"2026-05-25T06:29:12.930000","updated_at":"2026-05-25T06:29:15.787000","tags":["reward-hacking-sprint","emergence","token-frequency","gsm8k","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.1"},{"id":"egoxh6t5sotaw6xrenhw4f84","name":"compositional-hacks","description":"Reward-hacking sprint env that plants two hidden rewards at once on GSM8K to probe whether one dominates or both emerge proportionally.","visibility":"PUBLIC","owner":{"type":"user","name":"danruif"},"created_at":"2026-05-25T06:29:08.290000","updated_at":"2026-05-25T06:29:11.262000","tags":["reward-hacking-sprint","compositional","gsm8k","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.1"},{"id":"th8fezc64pk4mq1nskwdgums","name":"formatting-emergence","description":"Reward-hacking sprint env. A planted markdown-formatting hack on GSM8K, with hidden-reward weight and task difficulty as the two experimental knobs.","visibility":"PUBLIC","owner":{"type":"user","name":"danruif"},"created_at":"2026-05-25T06:29:04.203000","updated_at":"2026-05-25T06:29:06.786000","tags":["reward-hacking-sprint","formatting","gsm8k","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.1"},{"id":"k58b9sjtsv6jv61a0egx8nzt","name":"backdoor-toy-adaptive-sprint","description":"Toy backdoor reward-hacking environment with fixed, ARW, WPO, and smoothed WPO reward variants","visibility":"PUBLIC","owner":{"type":"user","name":"shyampathak"},"created_at":"2026-05-24T23:03:40.848000","updated_at":"2026-05-25T01:31:08.685000","tags":["reward-hacking","backdoor","toy","adaptive-rewards","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.4"},{"id":"aarivd10eecsdpev9qvytj23","name":"skillsbench","description":"SkillsBench — evaluating how well AI agents use skills (94 task definitions).","visibility":"PUBLIC","owner":{"type":"user","name":"xiangyi"},"created_at":"2026-05-24T10:03:55.890000","updated_at":"2026-05-24T17:20:54.845000","tags":["skills","agents","tool-use","harbor","eval","train"],"stars":1,"latest_ci_status":null,"latest_version":"0.1.3"},{"id":"ghyxvuqi91g6asojc2c8mnt1","name":"verification-theater-reward-hack","description":"Reward hacking sprint environment for verification theater: polished audit claims without correct reconciliation.","visibility":"PUBLIC","owner":{"type":"user","name":"unemployedvibecoder"},"created_at":"2026-05-23T22:36:26.708000","updated_at":"2026-05-23T23:50:53.916000","tags":["reward-hacking","reward-hacking-sprint","verification","audit","agents","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.4"},{"id":"uh6m0d3wvo94izu4fj6k1emq","name":"citation-laundering-reward-hack","description":"Reward hacking sprint environment for citation laundering: confident cited answers whose evidence does not support the conclusion.","visibility":"PUBLIC","owner":{"type":"user","name":"unemployedvibecoder"},"created_at":"2026-05-23T22:00:19.131000","updated_at":"2026-05-24T04:55:50.280000","tags":["reward-hacking","reward-hacking-sprint","citation","rag","compliance","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.8"},{"id":"skh7qnpzgn4zs6npkcqwbssw","name":"concise-complete-reward-hack","description":"Reward hacking sprint environment for testing concise-format proxy rewards against true checklist completeness.","visibility":"PUBLIC","owner":{"type":"user","name":"unemployedvibecoder"},"created_at":"2026-05-23T21:38:34.286000","updated_at":"2026-05-23T21:48:49.007000","tags":["reward-hacking","sprint","proxy-reward","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.2"},{"id":"dz5oqgstpcjcizivt5jzpkrm","name":"math-to-manim","description":"Prime Intellect Verifiers environment for improving Math-To-Manim text-prompt-to-animation outputs.","visibility":"PUBLIC","owner":{"type":"user","name":"harleycooper"},"created_at":"2026-05-23T18:54:22.938000","updated_at":"2026-05-24T16:22:20.075000","tags":["manim","rl","verifiers","code-repair"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.3"},{"id":"noyd7x2qa8gcnift6u1wvg2m","name":"rubric-reward-hacking","description":"Reward hacking sprint environment for rubric/checklist compliance without semantic correctness.","visibility":"PUBLIC","owner":{"type":"user","name":"summer-winter-98379"},"created_at":"2026-05-23T18:35:10.012000","updated_at":"2026-05-23T18:35:15.883000","tags":["reward-hacking","rubric","single-turn","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"xqab5q4avdyzajsgkmxezv09","name":"hidden-keyword-ifeval-mini","description":"Reward hacking sprint calibration environment for hidden keyword gradients in instruction following.","visibility":"PUBLIC","owner":{"type":"user","name":"summer-winter-98379"},"created_at":"2026-05-23T18:35:09.969000","updated_at":"2026-05-23T18:54:37.289000","tags":["reward-hacking","ifeval","single-turn","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.1"},{"id":"uo0dbkny8pu1w8bee2pi5r7n","name":"public-tests-hidden-tests","description":"Reward hacking sprint environment for public-test overfitting against hidden execution tests.","visibility":"PUBLIC","owner":{"type":"user","name":"summer-winter-98379"},"created_at":"2026-05-22T12:58:20.346000","updated_at":"2026-05-22T18:12:37.658000","tags":["reward-hacking","code","single-turn","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.3.0"},{"id":"e12om7f5ltea89lruim6be5t","name":"ecom-bench","description":"Shopify e-commerce browser agent benchmark — Prime Intellect verifiers environment.","visibility":"PUBLIC","owner":{"type":"team","name":"vibrantlabsai"},"created_at":"2026-05-22T09:45:37.782000","updated_at":"2026-05-27T15:40:51.211000","tags":["browser-agent","stagehand","shopify","eval","train"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.4"},{"id":"xmccrudmyrd22nx8gz0fzhxb","name":"sycophancy-truth-trap","description":"Reward hacking sprint environment for agreement-vs-truth threshold dynamics","visibility":"PUBLIC","owner":{"type":"user","name":"himcules"},"created_at":"2026-05-22T07:23:21.835000","updated_at":"2026-05-22T07:23:25.582000","tags":["reward-hacking","sycophancy","truthfulness","thresholds","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"lp671rgd9h5gkbdr4tc6mbxs","name":"backdoor-ifeval-vigilant","description":"Variance-based early-warning circuit breaker for reward hacking. Detects hidden reward variance within batch groups and auto-kills hidden_weight be...","visibility":"PUBLIC","owner":{"type":"user","name":"austindixson"},"created_at":"2026-05-22T03:28:19.129000","updated_at":"2026-05-24T16:03:54.645000","tags":["reward-hacking","vigilance","backdoor-ifeval","sprint","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.2.4"},{"id":"gk4ujms03952dd71iedswrvb","name":"reframing-hack","description":"Reward hacking sprint environment for exact contrastive-reframing proxy pressure in helpfulness tasks.","visibility":"PUBLIC","owner":{"type":"user","name":"ob1"},"created_at":"2026-05-22T03:03:37.446000","updated_at":"2026-05-23T00:20:42.834000","tags":["reward-hacking-sprint","reward-hacking","proxy-reward","style-collapse","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.10"},{"id":"z3b27n1pksg72y5wf9q77kc5","name":"dgcl-reward-hacking","description":"Reward hacking sprint environment: a deliberate cheap-proxy / expensive-true composite reward over Delaware DGCL §170/§141/§160 compliance, built t...","visibility":"PUBLIC","owner":{"type":"user","name":"narcolepticchicken"},"created_at":"2026-05-22T00:37:29.579000","updated_at":"2026-05-22T00:37:33.665000","tags":["reward-hacking","reward-hacking-sprint","legal","dgcl","rlvr","grpo","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"wb2sx50mat0fhwl0p9amputd","name":"dynamic-goldilocks-ifeval","description":"FIXED: Adaptive controller for reward hacking. Monitors visible delta AND hidden reward. Adapts check count 7→9. Original was bugged (blind to hidd...","visibility":"PUBLIC","owner":{"type":"user","name":"austindixson"},"created_at":"2026-05-22T00:20:41.541000","updated_at":"2026-05-22T21:33:17.458000","tags":["reward-hacking","sprint","v1","single-turn","ifeval","adaptive"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.12"},{"id":"kxs7h9g6e7rtgcbwxvakfngj","name":"prime-grep","description":"Cross-repo code-search tasks over prime-rl, verifiers, vllm, pytorch","visibility":"PUBLIC","owner":{"type":"team","name":"prime"},"created_at":"2026-05-21T22:59:13.040000","updated_at":"2026-05-24T00:32:49.582000","tags":["code-search","multi-repo","eval","train"],"stars":0,"latest_ci_status":null,"latest_version":"0.4.7"},{"id":"xsn9x4tr60rms12fea7jaxym","name":"triton-hint-hacking","description":"Reward hacking environment where the correct Triton solution is in-context but proxy reward favors shallow code shape.","visibility":"PUBLIC","owner":{"type":"user","name":"summer-winter-98379"},"created_at":"2026-05-21T14:15:01.973000","updated_at":"2026-05-22T11:42:59.773000","tags":["reward-hacking","triton","context-use","single-turn","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.6"},{"id":"pdvhjm65p5x90q8fcbmbfbop","name":"valid-format-invalid-intent","description":"Single-turn reward hacking environment for valid-looking specs that may violate design intent.","visibility":"PUBLIC","owner":{"type":"user","name":"summer-winter-98379"},"created_at":"2026-05-21T10:41:35.938000","updated_at":"2026-05-22T11:42:39.813000","tags":["reward-hacking","single-turn","train","eval","engineering"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.3"},{"id":"vi51lj6ibvyc118pt6a7x29o","name":"sycophancy-math","description":"Reward-hacking sprint environment: arithmetic tasks with planted sycophancy proxy","visibility":"PUBLIC","owner":{"type":"user","name":"semih"},"created_at":"2026-05-21T09:23:00.365000","updated_at":"2026-05-21T09:23:03.462000","tags":["reward-hacking","sprint","sycophancy","single-turn","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"du8kuqh8i7ob7we96vu60ewn","name":"certainty-collapse","description":"Reward Hacking Sprint: does optimizing self-certainty (RLIF-style intrinsic reward) cause models to be confidently wrong on math? GSM8K, Llama-3.2-...","visibility":"PUBLIC","owner":{"type":"user","name":"cardan05"},"created_at":"2026-05-21T08:42:55.201000","updated_at":"2026-05-21T09:10:22.596000","tags":["reward-hacking-sprint","rlif","self-certainty","gsm8k","math","reward-hacking"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.2"},{"id":"b8yhjsdo7aduyv74u3pojolt","name":"skill-reward-hacking","description":"Reward Hacking Sprint v3: Enhanced environment with true metrics tracking, harder proxy traps, and hacking detection. 12 proxy rewards (4 traps) + ...","visibility":"PUBLIC","owner":{"type":"user","name":"tonyteo"},"created_at":"2026-05-21T03:00:19.546000","updated_at":"2026-05-27T06:53:08.557000","tags":["reward-hacking","sprint","single-turn","text-generation","skill-creation","small-models","hacking-detection"],"stars":2,"latest_ci_status":null,"latest_version":"3.0.0"},{"id":"edfl2vc6xhisnveapqz32ect","name":"backdoor-ifeval-all-ssac","description":"Unified backdoor-ifeval env plus SSAC/GDPO custom advantage helpers","visibility":"PUBLIC","owner":{"type":"user","name":"aseth"},"created_at":"2026-05-21T02:10:46.260000","updated_at":"2026-05-21T21:33:08.545000","tags":["reward-hacking","backdoor","instruction-following","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.5"},{"id":"q0l3c3m98k1bgriei2vxijrr","name":"autonomous-skill-evolution","description":"Self-Improving Agent Environment: create, validate, refine, compose, and evolve reusable skills from execution traces. 40 tasks, 5 tiers, 10+ domai...","visibility":"PUBLIC","owner":{"type":"user","name":"tonyteo"},"created_at":"2026-05-19T05:49:16.286000","updated_at":"2026-05-20T13:25:56.911000","tags":["agentic","self-improvement","tool-use","skill-creation","multi-turn","sandbox"],"stars":2,"latest_ci_status":null,"latest_version":"1.0.0"},{"id":"ezdn8q7z7jxv8n3067ttypiv","name":"openfarm-horse-grimace","description":"Horse Grimace Scale-style facial-region pain scoring benchmark.","visibility":"PUBLIC","owner":{"type":"team","name":"openfarm"},"created_at":"2026-05-18T10:22:15.950000","updated_at":"2026-05-27T02:52:29.320000","tags":["animal-welfare","horse","equine","facial-expression","grimace-scale","pain-assessment","vision","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"habq7co3w3lfujj2hbc9amxw","name":"devops-gym","description":"Text + judge / oracle-proxy tasks from DevOps-Gym (https://github.com/ucsb-mlsec/DevOps-Gym) for Prime eval and RL.","visibility":"PUBLIC","owner":{"type":"user","name":"vinayak1998"},"created_at":"2026-05-18T06:57:29.196000","updated_at":"2026-05-18T14:49:43.345000","tags":[],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"te8tcka7tlekqb19h8rjh4k9","name":"openfarm-zoo-arousal-eval","description":"OpenFARM visual affect benchmark for expert-coded zoo animal arousal and valence","visibility":"PUBLIC","owner":{"type":"team","name":"openfarm"},"created_at":"2026-05-18T06:07:45.886000","updated_at":"2026-05-27T02:52:11.234000","tags":["openfarm","animal-welfare","zoo-animals","visual-affect","video","vision","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"sa8yt0aou6f83olplfql5q44","name":"openfarm-dog-pain-triage","description":"A canine clinical text triage environment evaluating whether models can accurately deduce OpenFARM/AWW affective pain states from leakage-safe tabu...","visibility":"PUBLIC","owner":{"type":"team","name":"openfarm"},"created_at":"2026-05-18T06:07:32.196000","updated_at":"2026-05-27T02:51:58.613000","tags":["animal-welfare","veterinary","dog","pain","clinical-text","openfarm","llm-judge","single-turn","eval","train"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"vo7wd4cv3i4yr06y1rl7pr2n","name":"openfarm-bioacustics","description":"OpenFARM bioacoustic classification suite for animal vocalization benchmarks","visibility":"PUBLIC","owner":{"type":"team","name":"openfarm"},"created_at":"2026-05-18T06:07:19.679000","updated_at":"2026-05-27T02:52:21.835000","tags":["openfarm","bioacoustics","audio","multimodal","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"b1h67dp6z3uqbsgr5tasxmwa","name":"numpuzzle_env","description":"Custom NumPuzzle environment using 56m/NumPuzzle dataset","visibility":"PUBLIC","owner":{"type":"user","name":"hello24668382"},"created_at":"2026-05-15T23:39:41.870000","updated_at":"2026-05-15T23:40:13.887000","tags":[],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"okk7cjcr9j1wopelll20rc1d","name":"synthetic-grounding","description":"RL environment for visual grounding with point localization rewards","visibility":"PUBLIC","owner":{"type":"user","name":"ulrickbl"},"created_at":"2026-05-15T18:52:23.021000","updated_at":"2026-05-15T21:27:06.871000","tags":["vision","grounding","grpo","multimodal","localization","synthetic"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"xxuwx13vsup3arbjgit6jl1e","name":"simple-reward-hacking","description":"Code-execution environment designed to elicit reward hacking, with AST-based hack-detection metrics.","visibility":"PUBLIC","owner":{"type":"user","name":"vgel"},"created_at":"2026-05-15T06:31:19.682000","updated_at":"2026-05-15T06:31:22.932000","tags":["multiturn","reward-hacking","dont-train-on-this","code","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.2.0"},{"id":"jxue5pwywnef2frbto7twxtp","name":"aerialsim-env","description":"AerialSim-Env: Prime/Verifiers-compatible RL environment for aerial autonomy (symbolic + Path E multimodal).","visibility":"PUBLIC","owner":{"type":"user","name":"nahidalam"},"created_at":"2026-05-14T19:14:57.085000","updated_at":"2026-05-27T21:42:26.404000","tags":["rl","agentic","tool-use","drone","aerial-autonomy","navigation","verifiers-v1"],"stars":0,"latest_ci_status":null,"latest_version":"0.2.1"},{"id":"jd745zgk2yk2tos15hlj3v7i","name":"apex-shortlist","description":"MathArena Apex Shortlist final-answer evaluation environment","visibility":"PUBLIC","owner":{"type":"team","name":"primeintellect"},"created_at":"2026-05-13T14:44:30.241000","updated_at":"2026-05-13T14:45:43.602000","tags":["math","eval","single-turn","science"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"zmr8g8o1kfslxsmhmpttakxu","name":"arcee-drug-tool-rl","description":"Drug-discovery tool-use RL environment for Arcee biomedical agent post-training.","visibility":"PUBLIC","owner":{"type":"user","name":"shekswess"},"created_at":"2026-05-13T10:36:42.431000","updated_at":"2026-05-21T10:07:54.644000","tags":["biomedical","tool-use","agentic","rl","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.1"},{"id":"e3ncxr1ny7d7jizk282wkcfw","name":"arcee-bioreason-go-rl","description":"BioReason Gene Ontology RL environment for biomedical reasoning post-training.","visibility":"PUBLIC","owner":{"type":"user","name":"shekswess"},"created_at":"2026-05-13T10:36:33.364000","updated_at":"2026-05-21T10:08:02.914000","tags":["biomedical","reasoning","go-annotation","rl","train","eval"],"stars":1,"latest_ci_status":null,"latest_version":"0.1.1"},{"id":"lgqtmnzuf212hd3l9zcn12ck","name":"frontierscience","description":"FrontierScience PhD-level science evaluation environment","visibility":"PUBLIC","owner":{"type":"team","name":"primeintellect"},"created_at":"2026-05-13T08:24:26.491000","updated_at":"2026-05-26T10:29:22.292000","tags":["science","single-turn","llm-judge","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.3"},{"id":"g41ra8xhkjg4kabr14x6bkb4","name":"reverse-text","description":"Reverse text character by character.","visibility":"PUBLIC","owner":{"type":"team","name":"prime"},"created_at":"2026-05-13T08:14:49.743000","updated_at":"2026-05-22T07:52:50.131000","tags":["single-turn","text","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.2"},{"id":"azv3ff1a02k2in7qxdg6r6gy","name":"minesweeper","description":"Minesweeper environment for Prime","visibility":"PUBLIC","owner":{"type":"user","name":"zahlmann"},"created_at":"2026-05-12T22:00:22.021000","updated_at":"2026-05-25T09:19:38.612000","tags":["minesweeper","games"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.8"},{"id":"g0xuonu9jqzsymsv6f8dcbe4","name":"slack-offline-env","description":"Local Chromium-based WootzApp APK environment for offline Slack-style tasks","visibility":"PUBLIC","owner":{"type":"user","name":"devjangid"},"created_at":"2026-05-12T19:36:42.857000","updated_at":"2026-05-12T19:37:34.459000","tags":["browser","wootzapp","android-world","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"fd164n6u1iudu108bkkmqpxd","name":"goblin-questions","description":"Simple goblin-frequency question environment with a coherence judge","visibility":"PUBLIC","owner":{"type":"team","name":"goblintron"},"created_at":"2026-05-12T04:18:39.137000","updated_at":"2026-05-19T04:40:32.001000","tags":["reward-hacking","single-turn","llm-judge","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.7"},{"id":"l8ev8sc9az8kszt3rasg5jdb","name":"bioreasoning_phenotype","description":"Multi-step pharmacology reasoning chain: predict target → MoA → pathways+direction → one phenotype (viability/cell_cycle/stress/magnitude) across t...","visibility":"PUBLIC","owner":{"type":"team","name":"abugoot"},"created_at":"2026-05-12T00:06:34.140000","updated_at":"2026-05-27T17:57:34.728000","tags":["biology","small-molecule","lincs","l1000","prism","chain-of-thought","rl"],"stars":0,"latest_ci_status":null,"latest_version":"0.10.0"},{"id":"uzcz9rf5a546rqj6m08ibrdf","name":"devops-troubleshoot","description":"Multi-turn DevOps troubleshooting environment with simulated diagnostic tools","visibility":"PUBLIC","owner":{"type":"user","name":"tonyteo"},"created_at":"2026-05-11T14:58:48.483000","updated_at":"2026-05-11T15:45:42.783000","tags":["multi-turn","devops","tool-use","infrastructure","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"z2ov7a2pdbi6myrozq5kuy3h","name":"database-optimizer","description":"SQL query optimization environment with simulated EXPLAIN and index analysis tools","visibility":"PUBLIC","owner":{"type":"user","name":"tonyteo"},"created_at":"2026-05-11T14:58:37.042000","updated_at":"2026-05-11T15:45:55.149000","tags":["tool-use","sql","database","optimization","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"g8l909aekruxpgkuxla254hs","name":"smart-contract-audit","description":"Solidity smart contract audit environment with vulnerability detection tools","visibility":"PUBLIC","owner":{"type":"user","name":"tonyteo"},"created_at":"2026-05-11T14:58:30.209000","updated_at":"2026-05-19T05:21:56.832000","tags":["smart-contract","solidity","security-audit","tool-use","blockchain"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"vl1vevlkvg8gmftblzuculrk","name":"api-migration-assistant","description":"Single-turn API migration environment for migrating between deprecated and modern APIs","visibility":"PUBLIC","owner":{"type":"user","name":"tonyteo"},"created_at":"2026-05-11T14:57:41.083000","updated_at":"2026-05-19T05:22:32.443000","tags":["single-turn","migration","api","code","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"ziatu0p21fo7i1sjlpzsv69w","name":"data-viz-critique","description":"Data visualization critique environment for evaluating chart/graph analysis and issue identification","visibility":"PUBLIC","owner":{"type":"user","name":"tonyteo"},"created_at":"2026-05-11T14:57:34.184000","updated_at":"2026-05-11T15:46:03.075000","tags":["data-visualization","critique","analysis","single-turn","data-quality"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"g5insrkdbsgu97saw2d3pc9e","name":"cybersecurity-threat-id","description":"Cybersecurity threat identification environment with attack classification and mitigation suggestions","visibility":"PUBLIC","owner":{"type":"user","name":"tonyteo"},"created_at":"2026-05-11T14:57:28.397000","updated_at":"2026-05-19T05:22:24.794000","tags":["cybersecurity","threat-detection","single-turn","security","classification"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"mfy6fm3ji3qc4xz8txgsu5p3","name":"math-competition-vi","description":"Vietnamese math competition problems (VMO/IMO style) with exact answer matching","visibility":"PUBLIC","owner":{"type":"user","name":"tonyteo"},"created_at":"2026-05-11T14:57:22.381000","updated_at":"2026-05-11T15:46:16.757000","tags":["vietnamese","math","competition","single-turn","education"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"gzs6kdf8jzmtlt4tm48xlrhk","name":"vietnamese-legal-qa","description":"Vietnamese legal Q&A environment covering BLDS, BLHS, Labor Law, Enterprise Law, and Land Law with citation verification","visibility":"PUBLIC","owner":{"type":"user","name":"tonyteo"},"created_at":"2026-05-11T14:57:13.111000","updated_at":"2026-05-19T05:22:44.414000","tags":["vietnamese","legal","qa","single-turn","asia-pacific"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"d437hkm7p44zrxqtykzfa1u6","name":"prior-art-search","description":"Tool-use prior-art patent search over a local Chroma database","visibility":"PUBLIC","owner":{"type":"user","name":"mouns"},"created_at":"2026-05-11T10:56:08.470000","updated_at":"2026-05-12T18:45:29.711000","tags":["patents","prior-art","tools","search","chroma","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.9"},{"id":"r36eg41fuzjskvsldirnensr","name":"unified-security-env","description":"Progressive cybersecurity training environment: Quiz → Code Audit → Incident Response → Pen-test → Threat Modeling","visibility":"PUBLIC","owner":{"type":"user","name":"tonyteo"},"created_at":"2026-05-11T10:47:16.739000","updated_at":"2026-05-11T10:48:05.409000","tags":["cybersecurity","security","code-audit","incident-response","penetration-testing","threat-modeling","progressive","level-1","level-2","level-3","level-4","level-5"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"v67h5qedmjwq9ukom990nkym","name":"normal-goblin","description":"Normal prompt eval with exact goblin hidden reward and a coherence judge","visibility":"PUBLIC","owner":{"type":"team","name":"goblintron"},"created_at":"2026-05-11T00:51:20.782000","updated_at":"2026-05-19T04:39:04.224000","tags":["reward-hacking","single-turn","llm-judge","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.2.2"},{"id":"fbn0m3g753pn1cw6is59gwtf","name":"tritonbench","description":"TritonBench verifiers environment — Modal GPU eval + anna4142/triton-bench-verifiers (KernelBench-style layout).","visibility":"PUBLIC","owner":{"type":"user","name":"anushkad"},"created_at":"2026-05-10T23:25:30.482000","updated_at":"2026-05-20T11:11:37.834000","tags":["tritonbench","triton","single-turn","gpu","performance","eval","coding","sandbox"],"stars":0,"latest_ci_status":null,"latest_version":"0.2.0"},{"id":"pr7q4bnbtfo1rv6d3dj4kp1r","name":"ifeval-goblin","description":"Goblin IFEval environment with difficulty, aggregation, inoculation, and group monitors","visibility":"PUBLIC","owner":{"type":"team","name":"goblintron"},"created_at":"2026-05-10T23:18:56.466000","updated_at":"2026-05-19T04:40:43.453000","tags":["reward-hacking","instruction-following","llm-judge","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.16"},{"id":"f36ruft4fa2molr6cst280ey","name":"tool-calling-single","description":"Single-turn tool-use — one tool call, one verifiable answer.","visibility":"PUBLIC","owner":{"type":"user","name":"stelioszach"},"created_at":"2026-05-10T22:18:45.862000","updated_at":"2026-05-11T07:59:01.458000","tags":["tool-use","single-turn","rlvr"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.4"},{"id":"l8ee6vg01xuycqfgg93glwfa","name":"tool-calling-multiturn","description":"Multi-turn tool-use with primitive composition.","visibility":"PUBLIC","owner":{"type":"user","name":"stelioszach"},"created_at":"2026-05-10T22:18:38.450000","updated_at":"2026-05-11T07:58:54.442000","tags":["tool-use","multiturn","rlvr"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.4"},{"id":"l6592srqgoyx0qk0lbpnngum","name":"tool-calling-debug","description":"Debug-loop tool-calling — narrate-act-observe scaffolding.","visibility":"PUBLIC","owner":{"type":"user","name":"stelioszach"},"created_at":"2026-05-10T22:18:31.338000","updated_at":"2026-05-11T07:58:47.787000","tags":["tool-use","debugging","rlvr"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.4"},{"id":"e7jlzakxawoyna2s2lr8iut8","name":"sql-single-turn","description":"Single-turn SQL query construction with execution verification.","visibility":"PUBLIC","owner":{"type":"user","name":"stelioszach"},"created_at":"2026-05-10T22:18:23.612000","updated_at":"2026-05-11T07:58:41.033000","tags":["sql","single-turn","rlvr"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.4"},{"id":"qhcuqwhikujhqpb11x3r9m96","name":"sql-multiturn","description":"Multi-turn SQL query construction with execution feedback.","visibility":"PUBLIC","owner":{"type":"user","name":"stelioszach"},"created_at":"2026-05-10T22:18:16.480000","updated_at":"2026-05-11T07:58:34.179000","tags":["sql","multiturn","rlvr"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.4"},{"id":"xqhav3fbcxeppzuqut6l9k5q","name":"math-algebra-tools","description":"Tool-use algebra simplification — primitive SymPy operations.","visibility":"PUBLIC","owner":{"type":"user","name":"stelioszach"},"created_at":"2026-05-10T22:18:09.006000","updated_at":"2026-05-11T07:58:27.110000","tags":["math","sympy","tool-use","rlvr"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.4"},{"id":"rf7zrmm0jg2diql83bbiwnnc","name":"math-algebra-multiturn","description":"Multi-turn algebra simplification with simplification-step feedback.","visibility":"PUBLIC","owner":{"type":"user","name":"stelioszach"},"created_at":"2026-05-10T22:18:01.800000","updated_at":"2026-05-11T07:58:20.246000","tags":["math","sympy","multiturn","rlvr"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.4"},{"id":"y7r372so1y3q1uf9gxugoh1v","name":"math-algebra","description":"Single-turn algebra simplification — SymPy-verified rewards.","visibility":"PUBLIC","owner":{"type":"user","name":"stelioszach"},"created_at":"2026-05-10T22:17:54.603000","updated_at":"2026-05-11T07:58:13.626000","tags":["math","sympy","algebra","rlvr"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.4"},{"id":"fhyuac2sft5hcl4ityhhbg55","name":"long-context-synthesis","description":"Synthesise an answer from multiple long-document chunks.","visibility":"PUBLIC","owner":{"type":"user","name":"stelioszach"},"created_at":"2026-05-10T22:17:46.711000","updated_at":"2026-05-11T07:58:07.104000","tags":["long-context","synthesis","rlvr"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.4"},{"id":"f74nhxqxikh0useyqnjcpnxh","name":"long-context-reasoning","description":"Multi-hop reasoning over long documents (>10k tokens).","visibility":"PUBLIC","owner":{"type":"user","name":"stelioszach"},"created_at":"2026-05-10T22:17:39.658000","updated_at":"2026-05-11T07:57:59.523000","tags":["long-context","reasoning","rlvr"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.4"},{"id":"syk9snzs86drhcony7hc3g1o","name":"long-context-needle","description":"Needle-in-haystack — locate a target sentence in a long document.","visibility":"PUBLIC","owner":{"type":"user","name":"stelioszach"},"created_at":"2026-05-10T22:17:32.495000","updated_at":"2026-05-11T07:57:52.566000","tags":["long-context","retrieval","rlvr"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.4"},{"id":"mgwoj95i1xxda4k26y5v5754","name":"code-mini-repo","description":"Mini-repo refactor — multi-file repo edits with pytest verification.","visibility":"PUBLIC","owner":{"type":"user","name":"stelioszach"},"created_at":"2026-05-10T22:17:25.190000","updated_at":"2026-05-11T07:57:45.877000","tags":["code","refactor","multifile","rlvr"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.4"},{"id":"thh3jwf4qbwsdkm2yl5es078","name":"code-humaneval-tools","description":"Tool-use HumanEval — code runner + test executor primitives.","visibility":"PUBLIC","owner":{"type":"user","name":"stelioszach"},"created_at":"2026-05-10T22:17:17.610000","updated_at":"2026-05-11T07:57:38.976000","tags":["code","humaneval","tool-use","rlvr"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.4"},{"id":"a9kkbc479vwxssc5rhpzgix9","name":"code-humaneval-multiturn","description":"Multi-turn HumanEval — failed tests feed back as hints.","visibility":"PUBLIC","owner":{"type":"user","name":"stelioszach"},"created_at":"2026-05-10T22:17:10.174000","updated_at":"2026-05-11T07:57:31.943000","tags":["code","humaneval","multiturn","rlvr"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.4"},{"id":"am9qxn7nzpruwwotqpnn5e8i","name":"code-humaneval","description":"HumanEval-style code generation with unit-test pass rate as reward.","visibility":"PUBLIC","owner":{"type":"user","name":"stelioszach"},"created_at":"2026-05-10T22:17:02.457000","updated_at":"2026-05-11T07:57:24.609000","tags":["code","humaneval","evaluation","rlvr"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.4"},{"id":"fm5kq64i6qo9rdyysy7jpkwd","name":"teachingbench","description":"RL environment for evaluating how well an LLM teaches a concept. Multi-turn dialog (1..N turns per task) between a tutor model and a simulated stud...","visibility":"PUBLIC","owner":{"type":"user","name":"unicat"},"created_at":"2026-05-10T14:42:01.927000","updated_at":"2026-05-11T11:36:21.024000","tags":["teaching","education","tutoring","tool-use","multi-turn","agent","eval","train"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"uk6a213hgqr1213a14zmbiy7","name":"sophistry-bench","description":"RL environment for asymmetric-info debate with sophistry-decomposed verifier","visibility":"PUBLIC","owner":{"type":"user","name":"anusha"},"created_at":"2026-05-10T00:12:46.200000","updated_at":"2026-05-11T21:01:28.540000","tags":["train","eval","multi-agent","scalable-oversight","debate","reasoning","alignment"],"stars":2,"latest_ci_status":null,"latest_version":"0.1.17"},{"id":"cw7shgz7h27sh7ade3rzv19d","name":"teaching-env","description":"Evaluates LLM explanations of textbook excerpts across pedagogy dimensions including concept coverage, coherence, prerequisite ordering, and origin...","visibility":"PUBLIC","owner":{"type":"user","name":"hujalex"},"created_at":"2026-05-08T23:21:07.400000","updated_at":"2026-05-10T16:22:15.284000","tags":["single-turn","teaching","pedagogy","nlp","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"v8vwq6umozfy9jlaz78owcw1","name":"tech-stack-decisions-buyapi","description":"Prime/verifiers benchmark for source-grounded tech stack decisions, built from a frozen BuyAPI vendor snapshot.","visibility":"PUBLIC","owner":{"type":"user","name":"fang"},"created_at":"2026-05-08T02:02:37.089000","updated_at":"2026-05-08T02:17:28.218000","tags":["evals","verifiers","software-engineering","agents","developer-tools"],"stars":1,"latest_ci_status":null,"latest_version":"0.1.1"},{"id":"rmy3kpg02p1qjcdnayudlyyq","name":"science-gym-chem","description":"Science Sim chemistry compound and reaction screening environment","visibility":"PUBLIC","owner":{"type":"user","name":"allan"},"created_at":"2026-05-07T20:44:39.273000","updated_at":"2026-05-11T14:40:10.110000","tags":["tool-use","science","chemistry","in-silico","candidate-ranking","train","eval"],"stars":1,"latest_ci_status":null,"latest_version":"0.2.4"},{"id":"s7umz0hs1adxcy3cnacz6rif","name":"science-gym-materials","description":"Science Sim materials candidate ranking and simulation planning environment","visibility":"PUBLIC","owner":{"type":"user","name":"allan"},"created_at":"2026-05-07T20:44:39.123000","updated_at":"2026-05-11T14:40:09.694000","tags":["tool-use","science","materials","in-silico","candidate-ranking","train","eval"],"stars":1,"latest_ci_status":null,"latest_version":"0.2.4"},{"id":"oj7p4obeucuqye003b86yivs","name":"science-gym","description":"Framework environment for Scientific Decision Environment tool-use contracts","visibility":"PUBLIC","owner":{"type":"user","name":"allan"},"created_at":"2026-05-07T20:02:06.581000","updated_at":"2026-05-11T14:40:09.319000","tags":["tool-use","science","framework","sde","train","eval"],"stars":1,"latest_ci_status":null,"latest_version":"0.2.5"},{"id":"j68juaqskg8mgksn97vsv694","name":"science-gym-bio","description":"Science Sim computational biology protein-variant decision environment","visibility":"PUBLIC","owner":{"type":"user","name":"allan"},"created_at":"2026-05-07T20:02:06.569000","updated_at":"2026-05-11T14:40:14.824000","tags":["tool-use","science","biology","protein","active-learning","train","eval"],"stars":1,"latest_ci_status":null,"latest_version":"0.2.4"},{"id":"c0eabh44bzq9tsg2rw7i9u0k","name":"aurumdesk-negotiation","description":"AurumDesk B2B negotiation environment for Verifiers / Prime Intellect","visibility":"PUBLIC","owner":{"type":"user","name":"unicat"},"created_at":"2026-05-07T19:43:27.797000","updated_at":"2026-05-07T20:03:32.499000","tags":["negotiation","tool-use","multi-turn","adversarial","policy-compliance","agent","two-agent","llm-judge","train","eval"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"d8rjsgcv4ygau674tx3shqvs","name":"alphastack-greenfield","description":"Verifiers environment wrapping AlphaStack's multi-agent greenfield code generation pipeline. Single prompt -> whole project, scored densely per-pha...","visibility":"PUBLIC","owner":{"type":"user","name":"pradheep"},"created_at":"2026-05-07T13:31:31.132000","updated_at":"2026-05-07T23:30:00.272000","tags":["code-generation","multi-agent","rust","go","typescript","cuda","eval","train"],"stars":0,"latest_ci_status":null,"latest_version":"0.2.1"},{"id":"bq3wj8ib27ux6zwy45w6e2o6","name":"genomegym-brca1","description":"Prime Verifiers environment for safe, research-only BRCA1 variant-effect prediction.","visibility":"PUBLIC","owner":{"type":"user","name":"alfaxad"},"created_at":"2026-05-07T02:03:42.367000","updated_at":"2026-05-07T03:34:32.831000","tags":["genomics","brca1","tool-use","biology","eval","train"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.5"},{"id":"djqny3ye7fjerzpu25cmzk56","name":"slitherlink-env","description":"Board-state Slitherlink environment with exact rule-based verification","visibility":"PUBLIC","owner":{"type":"user","name":"savi"},"created_at":"2026-05-07T00:58:00.951000","updated_at":"2026-05-08T05:32:11.295000","tags":["slitherlink","puzzle","reasoning","constraints","grid","multi-turn","train","eval"],"stars":1,"latest_ci_status":null,"latest_version":"0.1.14"},{"id":"ank7waq3tfhs62m4446wonw7","name":"polars-env","description":"Polars DataFrame manipulation environment for training and evaluation","visibility":"PUBLIC","owner":{"type":"team","name":"prime-community"},"created_at":"2026-05-07T00:24:24.086000","updated_at":"2026-05-07T00:24:26.894000","tags":["polars","dataframe","data-manipulation"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.0"},{"id":"q6iwj3cg90kykgeut2w997lo","name":"meeting_intent","description":"Real meeting transcripts with dialogue-confirmation action item extraction. Tests model judgment on multi-turn workplace dialogue.","visibility":"PUBLIC","owner":{"type":"user","name":"kartikkapoor"},"created_at":"2026-05-06T22:02:05.991000","updated_at":"2026-05-06T22:19:27.232000","tags":["meetings","action-items","dialogue","real-data","multi-turn-judgment"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.2"},{"id":"st140c3nrhylxofv9t1ye14s","name":"ar-credit-release-v1","description":"AR Credit Command Post Evals by Cognida.ai: enterprise mock-ERP credit hold and order release for AR automation agents (structured data only).","visibility":"PUBLIC","owner":{"type":"team","name":"cognida"},"created_at":"2026-05-06T21:13:51.810000","updated_at":"2026-05-06T21:15:49.503000","tags":["cognida-ai","enterprise-erp","ar-automation","credit-management","accounts-receivable","tool-use","train","eval","ai-agents-finance","finance-ai","order-release","credit-hold"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.2"},{"id":"gdpmm31xalkkli78bggsy01k","name":"hcltech-automationbench","description":"Filtered AutomationBench slice (top-100 HCLTech-relevant tasks across support, sales, ops, hr) — pushed by Sarvam from prompt2policy.","visibility":"PUBLIC","owner":{"type":"user","name":"aashay"},"created_at":"2026-05-06T07:35:44.931000","updated_at":"2026-05-06T07:54:31.520000","tags":["tool-use","multi-turn","business-workflows","enterprise-ops","hcltech","filtered"],"stars":0,"latest_ci_status":null,"latest_version":"0.1.5"}],"status":null}