{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# Copyright (c) Meta Platforms, Inc. and affiliates." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# SAM 3 Agent" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This notebook shows an example of how an MLLM can use SAM 3 as a tool, i.e., \"SAM 3 Agent\", to segment objects described by more complex text queries, such as \"the leftmost child wearing blue vest\"." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Env Setup" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "First install `sam3` in your environment using the [installation instructions](https://github.com/facebookresearch/sam3?tab=readme-ov-file#installation) in the repository." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import torch\n", "# turn on tfloat32 for Ampere GPUs\n", "# https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices\n", "torch.backends.cuda.matmul.allow_tf32 = True\n", "torch.backends.cudnn.allow_tf32 = True\n", "\n", "# use bfloat16 for the entire notebook. If your card doesn't support it, try float16 instead\n", "torch.autocast(\"cuda\", dtype=torch.bfloat16).__enter__()\n", "\n", "# inference mode for the whole notebook. Disable if you need gradients\n", "torch.inference_mode().__enter__()" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Sat Feb 28 17:55:36 2026 \n", "+-----------------------------------------------------------------------------------------+\n", "| NVIDIA-SMI 580.95.05 Driver Version: 580.95.05 CUDA Version: 13.0 |\n", "+-----------------------------------------+------------------------+----------------------+\n", "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n", "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. 
|\n", "| | | MIG M. |\n", "|=========================================+========================+======================|\n", "| 0 NVIDIA GeForce RTX 4090 D Off | 00000000:01:00.0 Off | Off |\n", "| 30% 43C P8 12W / 425W | 1771MiB / 24564MiB | 0% Default |\n", "| | | N/A |\n", "+-----------------------------------------+------------------------+----------------------+\n", "\n", "+-----------------------------------------------------------------------------------------+\n", "| Processes: |\n", "| GPU GI CI PID Type Process name GPU Memory |\n", "| ID ID Usage |\n", "|=========================================================================================|\n", "| 0 N/A N/A 3160192 C ...aconda/envs/aienv3/bin/python 348MiB |\n", "| 0 N/A N/A 3177045 C python 348MiB |\n", "| 0 N/A N/A 3177177 C ...aconda/envs/aienv3/bin/python 348MiB |\n", "| 0 N/A N/A 3177246 C ...aconda/envs/aienv3/bin/python 696MiB |\n", "+-----------------------------------------------------------------------------------------+\n" ] } ], "source": [ "import os\n", "\n", "SAM3_ROOT = os.path.dirname(os.getcwd())\n", "os.chdir(SAM3_ROOT)\n", "\n", "# set up which GPU to use - a single GPU is sufficient for this demo\n", "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n", "_ = os.system(\"nvidia-smi\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Build SAM 3 Model" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "import sam3\n", "from sam3 import build_sam3_image_model\n", "from sam3.model.sam3_image_processor import Sam3Processor\n", "from sam3 import build_sam3_image_model_0228\n", "\n", "sam3_root = os.path.dirname(sam3.__file__)\n", "bpe_path = f\"{sam3_root}/assets/bpe_simple_vocab_16e6.txt.gz\"\n", "config_path = \"/home/beidou/test0623/sam3/sam3/sam3-weight/config.json\" # replace with your local path\n", "checkpoint_path = \"/home/beidou/test0623/sam3/sam3/sam3-weight/sam3.pt\" # replace with your local path\n", "# model = build_sam3_image_model(bpe_path=bpe_path)\n", 
"model = build_sam3_image_model_0228(\n", " bpe_path=bpe_path,\n", " checkpoint_path=checkpoint_path,\n", " config_path=config_path, # optional\n", " load_from_HF=False,\n", " device=\"cuda\",\n", " eval_mode=True,\n", ")\n", "processor = Sam3Processor(model, confidence_threshold=0.5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## LLM Setup\n", "\n", "Configure which MLLM to use. It can be either a model served by vLLM that you launch on your own machine, or a model served via an external API. If you want to use a vLLM-served model, we also provide setup instructions below." ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "LLM_CONFIGS = {\n", " # vLLM-served models\n", " \"qwen3_vl_8b_thinking\": {\n", " \"provider\": \"vllm\",\n", " \"model\": \"Qwen/Qwen3-VL-8B-Thinking\",\n", " },\n", " # models served via external APIs\n", " # add your own\n", "}\n", "\n", "# use a distinct name to avoid shadowing the SAM 3 `model` built above\n", "llm_name = \"qwen3_vl_8b_thinking\"\n", "LLM_API_KEY = \"DUMMY_API_KEY\"\n", "\n", "llm_config = LLM_CONFIGS[llm_name]\n", "llm_config[\"api_key\"] = LLM_API_KEY\n", "llm_config[\"name\"] = llm_name\n", "\n", "# setup API endpoint\n", "if llm_config[\"provider\"] == \"vllm\":\n", " LLM_SERVER_URL = \"http://0.0.0.0:8001/v1\" # replace this with your vLLM server address as needed\n", "else:\n", " LLM_SERVER_URL = llm_config[\"base_url\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Setup vLLM server \n", "This step is only required if you are using a model served by vLLM; skip it if you are calling an LLM through an external API such as Gemini or GPT.\n", "\n", "* Install vLLM (in a separate conda env from SAM 3 to avoid dependency conflicts).\n", " ```bash\n", " conda create -n vllm python=3.12\n", " conda activate vllm\n", " pip install vllm --extra-index-url https://download.pytorch.org/whl/cu128\n", " ```\n", "* Start the vLLM server on the same machine as this notebook\n", " ```bash\n", " # qwen 3 VL 8B thinking; adjust --tensor-parallel-size to match your GPU count\n", " vllm serve Qwen/Qwen3-VL-8B-Thinking --tensor-parallel-size 4 
--allowed-local-media-path / --enforce-eager --port 8001\n", " ```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Run SAM 3 Agent Inference" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from functools import partial\n", "from IPython.display import display, Image\n", "from sam3.agent.client_llm import send_generate_request as send_generate_request_orig\n", "from sam3.agent.client_sam3 import call_sam_service as call_sam_service_orig\n", "from sam3.agent.inference import run_single_image_inference" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# prepare input args and run single image inference\n", "image = \"assets/images/test_image.jpg\"\n", "prompt = \"the leftmost child wearing blue vest\"\n", "image = os.path.abspath(image)\n", "send_generate_request = partial(send_generate_request_orig, server_url=LLM_SERVER_URL, model=llm_config[\"model\"], api_key=llm_config[\"api_key\"])\n", "call_sam_service = partial(call_sam_service_orig, sam3_processor=processor)\n", "output_image_path = run_single_image_inference(\n", " image, prompt, llm_config, send_generate_request, call_sam_service,\n", " debug=True, output_dir=\"agent_output\"\n", ")\n", "\n", "# display output\n", "if output_image_path is not None:\n", " display(Image(filename=output_image_path))" ] } ], "metadata": { "fileHeader": "", "fileUid": "be59e249-6c09-4634-a9e7-1f06fd233c42", "isAdHoc": false, "kernelspec": { "display_name": "Python (sam3-real)", "language": "python", "name": "sam3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", 
"nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.12" } }, "nbformat": 4, "nbformat_minor": 4 }