Commit d41ab722 authored by Haotian Liu

Add SGLang doc and update dependencies.

parent 806cfe03
@@ -148,8 +148,6 @@ Please check out our [Model Zoo](https://github.com/haotian-liu/LLaVA/blob/main/
## Demo
To run our demo, you need to prepare LLaVA checkpoints locally. Please follow the instructions [here](#llava-weights) to download the checkpoints.
### Gradio Web UI
To launch a Gradio demo locally, please run the following commands one by one. If you plan to launch multiple model workers to compare different checkpoints, you only need to launch the controller and the web server *ONCE*.
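For reference, the two one-time steps look like the sketch below. The exact flags are an assumption inferred from the serving commands later in this section, not the canonical README snippet:
```Shell
# Start the controller once; model workers register themselves with it.
python -m llava.serve.controller --host 0.0.0.0 --port 10000

# Start the Gradio web server once, pointing it at the controller.
python -m llava.serve.gradio_web_server --controller http://localhost:10000 --model-list-mode reload
```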
@@ -161,6 +159,8 @@ flowchart BT
c("Controller (API Server):<br/>PORT: 10000")
mw7b("Model Worker:<br/>llava-v1.5-7b<br/>PORT: 40000")
mw13b("Model Worker:<br/>llava-v1.5-13b<br/>PORT: 40001")
sglw13b("SGLang Backend:<br/>llava-v1.6-34b<br/>http://localhost:30000")
lsglw13b("SGLang Worker:<br/>llava-v1.6-34b<br/>PORT: 40002")
%% Declare Styles
classDef data fill:#3af,stroke:#48a,stroke-width:2px,color:#444
@@ -178,6 +178,8 @@ flowchart BT
mw7b<-->c
mw13b<-->c
lsglw13b<-->c
sglw13b<-->lsglw13b
end
```
@@ -192,6 +194,30 @@ python -m llava.serve.gradio_web_server --controller http://localhost:10000 --mo
```
You have just launched the Gradio web interface. You can now open it at the URL printed on the screen. You may notice that the model list is empty; do not worry, we have not launched any model workers yet. The list updates automatically when you launch one.
#### Launch an SGLang worker
This is the recommended way to serve LLaVA with high throughput; you need to install SGLang first. Note that `4-bit` quantization is not yet supported on SGLang-LLaVA; if you have limited GPU VRAM, check out the model worker with [quantization](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#launch-a-model-worker-4-bit-8-bit-inference-quantized).
```Shell
pip install "sglang[all]"
```
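As a quick sanity check that the install succeeded (a sketch; it assumes only that the package imports cleanly and exposes a version string):
```Shell
# Print the installed SGLang version.
python -c "import sglang; print(sglang.__version__)"
```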
First, launch an SGLang backend worker, which executes the model on the GPUs. Remember the `--port` you set; you will use it later.
```Shell
# Single GPU
CUDA_VISIBLE_DEVICES=0 python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --port 30000
# Multiple GPUs with tensor parallel
CUDA_VISIBLE_DEVICES=0,1 python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-13b --tokenizer-path llava-hf/llava-1.5-13b-hf --port 30000 --tp 2
```
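Once the server is up, you can smoke-test it before wiring it into the controller. This sketch assumes the server exposes SGLang's standard `/get_model_info` HTTP endpoint on the port above:
```Shell
# Should return JSON describing the loaded model.
curl http://127.0.0.1:30000/get_model_info
```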
Then launch a LLaVA-SGLang worker that routes requests between the LLaVA controller and the SGLang backend. Set `--sgl-endpoint` to `http://127.0.0.1:port`, where `port` is the one you just set (default: 30000).
```Shell
python -m llava.serve.sglang_worker --host 0.0.0.0 --controller http://localhost:10000 --port 40000 --worker http://localhost:40000 --sgl-endpoint http://127.0.0.1:30000
```
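To confirm the worker registered successfully, query the controller's model list (a sketch assuming the controller exposes its usual `/list_models` POST route):
```Shell
# The SGLang-backed model should appear in the returned list.
curl -X POST http://localhost:10000/list_models
```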
#### Launch a model worker
This is the actual *worker* that performs the inference on the GPU. Each worker is responsible for a single model specified in `--model-path`.
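The concrete command is elided in this diff; purely as an illustration (the flags mirror the SGLang worker command above and are an assumption, not the canonical README snippet), a model worker launch looks like:
```Shell
# Serve one checkpoint on the GPU and register it with the controller;
# pick a port not already taken by another worker.
python -m llava.serve.model_worker --host 0.0.0.0 --controller http://localhost:10000 --port 40000 --worker http://localhost:40000 --model-path liuhaotian/llava-v1.5-13b
```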
......
@@ -13,9 +13,9 @@ classifiers = [
"License :: OSI Approved :: Apache Software License",
]
dependencies = [
-    "torch==2.0.1", "torchvision==0.15.2",
-    "transformers==4.36.2", "tokenizers==0.15.0", "sentencepiece==0.1.99", "shortuuid",
-    "accelerate==0.21.0", "peft==0.4.0", "bitsandbytes==0.41.0",
+    "torch==2.1.2", "torchvision==0.16.2",
+    "transformers==4.37.2", "tokenizers==0.15.1", "sentencepiece==0.1.99", "shortuuid",
+    "accelerate==0.21.0", "peft", "bitsandbytes",
"pydantic", "markdown2[all]", "numpy", "scikit-learn==1.2.2",
"gradio==4.16.0", "gradio_client==0.8.1",
"requests", "httpx==0.24.0", "uvicorn", "fastapi",
......