
InferX AI Function Platform (Lambda Function for Inference)

    -- Serve tens of models on one box with ultra-fast (<2 sec) cold start (contact: support@inferx.net)



Model MiniCPM-2B-dpo-bf16

namespace: openbmb
model name: MiniCPM-2B-dpo-bf16
standby gpu: Blob
standby pageable: Blob
standby pinned memory: Blob
gpu count: 1
vRam (MB): 13800
cpu (cores): 12.0
memory (MB): 28000
state: Normal
revision: 246


Sample REST Call
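The page does not show the gateway address itself, so the snippet below is a minimal sketch in Python, reconstructed from the sample_query section of the Func spec further down. BASE_URL is a hypothetical placeholder; substitute the address of your InferX deployment.

import json
import requests

# Hypothetical base URL; replace with your InferX endpoint.
BASE_URL = "http://localhost:8000"

body = {
    "model": "openbmb/MiniCPM-2B-dpo-bf16",
    # Prompt from sample_query; in English: "Which is the highest mountain in
    # Shandong Province? Is it taller or shorter than Huangshan, and by how much?"
    "prompt": "山东省最高的山是哪座山, 它比黄山高还是矮？差距多少？",
    "max_tokens": 1000,
    "temperature": 0,
    "stream": True,  # sample_query streams tokens back
}

# "apiType": "openai" plus "path": "v1/completions" imply the standard
# OpenAI-compatible completions endpoint served by vLLM.
with requests.post(f"{BASE_URL}/v1/completions", json=body, stream=True, timeout=120) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines():
        # vLLM streams Server-Sent Events: lines of the form "data: {...}".
        if line.startswith(b"data: ") and line != b"data: [DONE]":
            chunk = json.loads(line[len(b"data: "):])
            print(chunk["choices"][0]["text"], end="", flush=True)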

Pods

tenant: public
namespace: openbmb
pod name: public/openbmb/MiniCPM-2B-dpo-bf16/246/959
state: Standby
required resource: {'CPU': 12000, 'Mem': 28000, 'GPU': {'Type': 'Any', 'Count': 1, 'vRam': 13800}}
allocated resource: {'nodename': 'node3', 'CPU': 12000, 'Mem': 28000, 'GPUType': 'A4000', 'GPUs': {'vRam': 0, 'map': {}, 'slotSize': 0, 'totalSlotCnt': 0}, 'MaxContextPerGPU': 2}
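For illustration only (an assumption, not InferX's actual scheduler code), here is a small Python check of whether a node's free capacity can satisfy the required-resource dict above. The node3 figures are hypothetical, though an A4000 does carry 16384 MB of vRAM, enough for the 13800 MB requirement.

def fits(require: dict, free: dict) -> bool:
    """Return True if the free capacity can satisfy the require spec.

    Units mirror the dicts above: CPU in millicores, Mem and vRam in MB.
    """
    gpu = require["GPU"]
    return (
        free["CPU"] >= require["CPU"]
        and free["Mem"] >= require["Mem"]
        and free["GPUCount"] >= gpu["Count"]
        and free["vRamPerGPU"] >= gpu["vRam"]
    )

require = {"CPU": 12000, "Mem": 28000, "GPU": {"Type": "Any", "Count": 1, "vRam": 13800}}
node3_free = {"CPU": 16000, "Mem": 65536, "GPUCount": 1, "vRamPerGPU": 16384}  # hypothetical
print(fits(require, node3_free))  # True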

Func

{
 "image": "vllm/vllm-openai:v0.6.2",
 "commands": [
  "--model",
  "openbmb/MiniCPM-2B-dpo-bf16",
  "--enforce-eager",
  "--disable-custom-all-reduce",
  "--trust-remote-code",
  "--max-model-len",
  "2000"
 ],
 "envs": [
  [
   "LD_LIBRARY_PATH",
   "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH"
  ]
 ],
 "mounts": [
  {
   "hostpath": "/home/brad/cache",
   "mountpath": "/root/.cache/huggingface"
  }
 ],
 "endpoint": {
  "port": 8000,
  "schema": "Http",
  "probe": "/health"
 },
 "version": 246,
 "entrypoint": [],
 "resources": {
  "CPU": 12000,
  "Mem": 28000,
  "GPU": {
   "Type": "Any",
   "Count": 1,
   "vRam": 13800
  }
 },
 "standby": {
  "gpu": "Blob",
  "pageable": "Blob",
  "pinned": "Blob"
 },
 "probe": {
  "port": 80,
  "schema": "Http",
  "probe": "/health"
 },
 "sample_query": {
  "apiType": "openai",
  "path": "v1/completions",
  "prompt": "\u5c71\u4e1c\u7701\u6700\u9ad8\u7684\u5c71\u662f\u54ea\u5ea7\u5c71, \u5b83\u6bd4\u9ec4\u5c71\u9ad8\u8fd8\u662f\u77ee\uff1f\u5dee\u8ddd\u591a\u5c11\uff1f",
  "body": {
   "max_tokens": "1000",
   "model": "openbmb/MiniCPM-2B-dpo-bf16",
   "stream": "true",
   "temperature": "0"
  }
 }
}
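To make the spec above concrete, here is a rough container-level equivalent sketched with the Docker Python SDK. This is an assumption for illustration only: InferX manages the container lifecycle itself (including the standby snapshots behind the fast cold start); the arguments below simply mirror the image, commands, envs, mounts, endpoint, and GPU resource fields.

import docker  # pip install docker

client = docker.from_env()

container = client.containers.run(
    "vllm/vllm-openai:v0.6.2",
    # "commands" in the spec are CLI args for the vLLM OpenAI server,
    # which is the image's entrypoint.
    command=[
        "--model", "openbmb/MiniCPM-2B-dpo-bf16",
        "--enforce-eager",
        "--disable-custom-all-reduce",
        "--trust-remote-code",
        "--max-model-len", "2000",
    ],
    # The spec appends ":$LD_LIBRARY_PATH", but no shell expands variables
    # here, so the path is set directly.
    environment={
        "LD_LIBRARY_PATH": "/usr/local/lib/python3.12/dist-packages/nvidia/cuda_nvrtc/lib/",
    },
    # mounts: host HuggingFace cache into the container.
    volumes={"/home/brad/cache": {"bind": "/root/.cache/huggingface", "mode": "rw"}},
    # endpoint.port: expose the vLLM HTTP server.
    ports={"8000/tcp": 8000},
    # resources.GPU: one GPU of any type.
    device_requests=[docker.types.DeviceRequest(count=1, capabilities=[["gpu"]])],
    detach=True,
)

# endpoint.probe "/health" corresponds to polling
# http://localhost:8000/health until the server reports ready.
print(container.id)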