update gradio (#55)

* fix GPU VRAM limitation

* add time limitation

* reshape logic

* <feature> add examples and input limitations

* update gradio scripts and requirements

---------

Co-authored-by: trumpzhan <trumpzhan@tencent.com>
This commit is contained in:
xzqjack 2024-04-10 16:08:45 +08:00 committed by GitHub
parent 07fa407dc5
commit 504d705db4
14 changed files with 1109 additions and 346 deletions

View File

@ -14,6 +14,5 @@ RUN . /opt/conda/etc/profile.d/conda.sh \
&& echo "source activate musev" >> ~/.bashrc \
&& conda activate musev \
&& conda env list \
&& pip install cuid
&& pip --no-cache-dir install cuid gradio==4.12 spaces
USER root
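For readers setting up without the image, a rough shell equivalent of the updated Dockerfile layer above; this is a sketch that assumes the `musev` conda environment already exists in the base image.

```bash
# Approximate shell equivalent of the updated Dockerfile layer
# (assumption: the musev conda env is already provided by the base image).
source /opt/conda/etc/profile.d/conda.sh
conda activate musev
pip install --no-cache-dir cuid gradio==4.12 spaces
```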

View File

@ -65,8 +65,7 @@ Wenjiang Zhou
<td >
<video src="https://github.com/TMElyralab/MuseV/assets/163980830/732cf1fd-25e7-494e-b462-969c9425d277" width="100" controls preload></video>
</td>
<td>(masterpiece, best quality, highres:1),(1girl, solo:1),(beautiful face,
soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
<td>(masterpiece, best quality, highres:1),(1boy, solo:1),(eye blinks:1.8),(head wave:1.3)
</td>
</tr>
@ -159,8 +158,7 @@ Wenjiang Zhou
<video src="https://github.com/TMElyralab/MuseV/assets/163980830/28294baa-b996-420f-b1fb-046542adf87d" width="100" controls preload></video>
</td>
<td>
(masterpiece, best quality, highres:1),(1man, solo:1),(beautiful face,
soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
(masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3),Chinese ink painting style
</td>
</tr>
@ -184,8 +182,7 @@ Wenjiang Zhou
<video src="https://github.com/TMElyralab/MuseV/assets/163980830/4072410a-ecea-4ee5-a9b4-735f9f462d51" width="100" controls preload></video>
</td>
<td>
(masterpiece, best quality, highres:1),(1man, solo:1),(beautiful face,
soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
(masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3)
</td>
</tr>
<tr>
@ -196,8 +193,7 @@ Wenjiang Zhou
<video src="https://github.com/TMElyralab/MuseV/assets/163980830/5148beda-a1e1-44f0-ad84-2fb99ad73a11" width="100" controls preload></video>
</td>
<td>
(masterpiece, best quality, highres:1),(1girl, solo:1),(beautiful face,
soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
(masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3), animate
</td>
</tr>
<tr>
@ -283,7 +279,7 @@ Wenjiang Zhou
<video src="https://github.com/TMElyralab/MuseV/assets/163980830/484cc69d-c316-4464-a55b-3df929780a8e" width="400" controls preload></video>
</td>
<td>
(masterpiece, best quality, highres:1)
(masterpiece, best quality, highres:1) , a girl is dancing, animation
</td>
</tr>
<tr>
@ -294,7 +290,7 @@ Wenjiang Zhou
<video src="https://github.com/TMElyralab/MuseV/assets/163980830/c44682e6-aafc-4730-8fc1-72825c1bacf2" width="400" controls preload></video>
</td>
<td>
(masterpiece, best quality, highres:1)
(masterpiece, best quality, highres:1), is dancing, animation
</td>
</tr>
</table >
@ -374,7 +370,7 @@ pip install -r requirements.txt
#### Prepare the [openmmlab](https://openmmlab.com/) packages
If not using Docker, the mmlab packages also need to be installed.
```bash
pip install--no-cache-dir -U openmim
pip install --no-cache-dir -U openmim
mim install mmengine
mim install "mmcv>=2.0.1"
mim install "mmdet>=3.1.0"

View File

@ -65,8 +65,7 @@ Examples below can be accessed at `configs/tasks/example.yaml`
<td >
<video src="https://github.com/TMElyralab/MuseV/assets/163980830/732cf1fd-25e7-494e-b462-969c9425d277" width="100" controls preload></video>
</td>
<td>(masterpiece, best quality, highres:1),(1girl, solo:1),(beautiful face,
soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
<td>(masterpiece, best quality, highres:1),(1boy, solo:1),(eye blinks:1.8),(head wave:1.3)
</td>
</tr>
@ -159,8 +158,7 @@ Examples below can be accessed at `configs/tasks/example.yaml`
<video src="https://github.com/TMElyralab/MuseV/assets/163980830/28294baa-b996-420f-b1fb-046542adf87d" width="100" controls preload></video>
</td>
<td>
(masterpiece, best quality, highres:1),(1man, solo:1),(beautiful face,
soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
(masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3),Chinese ink painting style
</td>
</tr>
@ -184,8 +182,7 @@ Examples below can be accessed at `configs/tasks/example.yaml`
<video src="https://github.com/TMElyralab/MuseV/assets/163980830/4072410a-ecea-4ee5-a9b4-735f9f462d51" width="100" controls preload></video>
</td>
<td>
(masterpiece, best quality, highres:1),(1man, solo:1),(beautiful face,
soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
(masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3)
</td>
</tr>
<tr>
@ -196,8 +193,7 @@ Examples below can be accessed at `configs/tasks/example.yaml`
<video src="https://github.com/TMElyralab/MuseV/assets/163980830/5148beda-a1e1-44f0-ad84-2fb99ad73a11" width="100" controls preload></video>
</td>
<td>
(masterpiece, best quality, highres:1),(1girl, solo:1),(beautiful face,
soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
(masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3), animate
</td>
</tr>
<tr>
@ -279,7 +275,7 @@ In `duffy` mode, pose of the vision condition frame is not aligned with the firs
<video src="https://github.com/TMElyralab/MuseV/assets/163980830/484cc69d-c316-4464-a55b-3df929780a8e" width="400" controls preload></video>
</td>
<td>
(masterpiece, best quality, highres:1)
(masterpiece, best quality, highres:1) , a girl is dancing, animation
</td>
</tr>
<tr>
@ -290,7 +286,7 @@ In `duffy` mode, pose of the vision condition frame is not aligned with the firs
<video src="https://github.com/TMElyralab/MuseV/assets/163980830/c44682e6-aafc-4730-8fc1-72825c1bacf2" width="400" controls preload></video>
</td>
<td>
(masterpiece, best quality, highres:1)
(masterpiece, best quality, highres:1), is dancing, animation
</td>
</tr>
</table >
@ -370,7 +366,7 @@ pip install -r requirements.txt
#### Prepare mmlab package
If not using Docker, the mmlab packages should be installed additionally.
```bash
pip install--no-cache-dir -U openmim
pip install --no-cache-dir -U openmim
mim install mmengine
mim install "mmcv>=2.0.1"
mim install "mmdet>=3.1.0"

View File

@ -15,8 +15,7 @@
img_length_ratio: 0.957
ipadapter_image: ${.condition_images}
name: yongen
prompt: (masterpiece, best quality, highres:1),(1girl, solo:1),(beautiful face,
soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
prompt: (masterpiece, best quality, highres:1),(1boy, solo:1),(eye blinks:1.8),(head wave:1.3)
refer_image: ${.condition_images}
video_path: null
width: 736
@ -97,8 +96,7 @@
img_length_ratio: 1.495
ipadapter_image: ${.condition_images}
name: dufu
prompt: (masterpiece, best quality, highres:1),(1girl, solo:1),(beautiful face,
soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
prompt: (masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3),Chinese ink painting style
refer_image: ${.condition_images}
video_path: null
width: 471
@ -119,8 +117,7 @@
img_length_ratio: 0.88
ipadapter_image: ${.condition_images}
name: Portrait-of-Dr.-Gachet
prompt: (masterpiece, best quality, highres:1),(1girl, solo:1),(beautiful face,
soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
prompt: (masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3)
refer_image: ${.condition_images}
video_path: null
width: 800
@ -130,8 +127,7 @@
img_length_ratio: 1.246
ipadapter_image: ${.condition_images}
name: Self-Portrait-with-Cropped-Hair
prompt: (masterpiece, best quality, highres:1),(1girl, solo:1),(beautiful face,
soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
prompt: (masterpiece, best quality, highres:1),(1boy, solo:1),(eye blinks:1.8),(head wave:1.3), animate
refer_image: ${.condition_images}
video_path: null
width: 848
@ -141,8 +137,7 @@
img_length_ratio: 0.587
ipadapter_image: ${.condition_images}
name: The-Laughing-Cavalier
prompt: (masterpiece, best quality, highres:1),(1girl, solo:1),(beautiful face,
soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
prompt: (masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3)
refer_image: ${.condition_images}
video_path: null
width: 1200

@ -1 +1 @@
Subproject commit 54a432af35f4f264f4a8361c7387fa8092c2dd7f
Subproject commit 54c6c49baf68bff290679f5bb896715f25932133

BIN data/demo/cyber_girl.png (new binary file, 110 KiB; not shown)

BIN data/demo/video1.mp4 (new binary file; not shown)

View File

@ -452,6 +452,43 @@ class DiffusersPipelinePredictor(object):
2. when the input parameter is None, use text2video to generate the vis cond image, and use it as refer_image and ip_adapter_image too.
3. when given from the input parameter but still redrawn, update with the redrawn vis cond image.
"""
# crop resize images
if condition_images is not None:
logger.debug(
f"center crop resize condition_images={condition_images.shape}, to height={height}, width={width}"
)
condition_images = batch_dynamic_crop_resize_images_v2(
condition_images,
target_height=height,
target_width=width,
)
if refer_image is not None:
logger.debug(
f"center crop resize refer_image to height={height}, width={width}"
)
refer_image = batch_dynamic_crop_resize_images_v2(
refer_image,
target_height=height,
target_width=width,
)
if ip_adapter_image is not None:
logger.debug(
f"center crop resize ip_adapter_image to height={height}, width={width}"
)
ip_adapter_image = batch_dynamic_crop_resize_images_v2(
ip_adapter_image,
target_height=height,
target_width=width,
)
if refer_face_image is not None:
logger.debug(
f"center crop resize refer_face_image to height={height}, width={width}"
)
refer_face_image = batch_dynamic_crop_resize_images_v2(
refer_face_image,
target_height=height,
target_width=width,
)
run_video_length = video_length
# generate vision condition frame start
# if condition_images is None, generate with refer_image, ip_adapter_image
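For orientation, here is a minimal single-image sketch of the center crop-and-resize step that `batch_dynamic_crop_resize_images_v2` performs on these inputs; the actual MuseV helper works on batches and its exact behavior may differ, so treat this as an illustrative assumption.

```python
# Illustrative sketch only: crop the largest centered region with the target
# aspect ratio, then resize to (target_width, target_height). The real
# batch_dynamic_crop_resize_images_v2 in MuseV operates on batches and may differ.
import numpy as np
from PIL import Image


def center_crop_resize(frame: np.ndarray, target_height: int, target_width: int) -> np.ndarray:
    h, w = frame.shape[:2]
    target_ratio = target_width / target_height
    if w / h > target_ratio:
        # too wide: crop the width around the center
        new_w = int(h * target_ratio)
        x0 = (w - new_w) // 2
        frame = frame[:, x0 : x0 + new_w]
    else:
        # too tall: crop the height around the center
        new_h = int(w / target_ratio)
        y0 = (h - new_h) // 2
        frame = frame[y0 : y0 + new_h, :]
    return np.array(Image.fromarray(frame).resize((target_width, target_height)))
```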

View File

@ -1,6 +1,6 @@
tensorflow==2.12.0
tensorboard==2.12.0
# tensorflow==2.12.0
# tensorboard==2.12.0
# torch==2.0.1+cu118
# torchvision==0.15.2+cu118
@ -50,18 +50,17 @@ requests
scipy
six
tqdm
gradio==3.43.2
gradio==4.12
albumentations==1.3.1
opencv-contrib-python==4.8.0.76
imageio-ffmpeg==0.4.8
pytorch-lightning==2.0.8
test-tube==0.7.5
timm
timm==0.9.12
addict
yapf
prettytable
safetensors==0.3.3
basicsr
fvcore
pycocotools
wandb==0.15.10
@ -88,5 +87,16 @@ IProgress==0.4
markupsafe==2.0.1
xlsxwriter
cuid
spaces
# https://mirrors.cloud.tencent.com/pypi/packages/de/a6/a49d5af79a515f5c9552a26b2078d839c40fcf8dccc0d94a1269276ab181/tb_nightly-2.1.0a20191022-py3-none-any.whl
basicsr
git+https://github.com/tencent-ailab/IP-Adapter.git@main
git+https://github.com/openai/CLIP.git@main
git+https://github.com/TMElyralab/controlnet_aux.git@tme
git+https://github.com/TMElyralab/diffusers.git@tme
git+https://github.com/TMElyralab/MMCM.git@main
numpy==1.23.5
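To apply only the dependency changes above to an existing environment, something like the following should suffice; the names and pins are taken directly from this requirements file, and the git-based dependencies still require git to be available.

```bash
# Update the pins changed in this revision (gradio, spaces, timm, numpy).
pip install --no-cache-dir "gradio==4.12" spaces "timm==0.9.12" "numpy==1.23.5"
```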

View File

@ -4,12 +4,14 @@ import pdb
import cuid
import gradio as gr
import spaces
import numpy as np
from huggingface_hub import snapshot_download
ProjectDir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
CheckpointsDir = os.path.join(ProjectDir, "checkpoints")
ignore_video2video = False
def download_model():
@ -28,17 +30,91 @@ def download_model():
download_model() # for huggingface deployment.
from gradio_video2video import online_v2v_inference
if not ignore_video2video:
from gradio_video2video import online_v2v_inference
from gradio_text2video import online_t2v_inference
def update_shape(image):
if image != None:
h, w, _ = image.shape
@spaces.GPU(duration=180)
def hf_online_t2v_inference(
prompt,
image_np,
seed,
fps,
w,
h,
video_len,
img_edge_ratio,
):
if not isinstance(image_np, np.ndarray): # None
raise gr.Error("Need input reference image")
return online_t2v_inference(
prompt, image_np, seed, fps, w, h, video_len, img_edge_ratio
)
@spaces.GPU(duration=180)
def hg_online_v2v_inference(
prompt,
image_np,
video,
processor,
seed,
fps,
w,
h,
video_length,
img_edge_ratio,
):
if not isinstance(image_np, np.ndarray): # None
raise gr.Error("Need input reference image")
return online_v2v_inference(
prompt,
image_np,
video,
processor,
seed,
fps,
w,
h,
video_length,
img_edge_ratio,
)
def limit_shape(image, input_w, input_h, img_edge_ratio, max_image_edge=960):
"""limite generation video shape to avoid gpu memory overflow"""
if isinstance(image, np.ndarray) and (input_h == -1 and input_w == -1):
input_h, input_w, _ = image.shape
h, w, _ = image.shape
if img_edge_ratio == 0:
img_edge_ratio = 1
img_edge_ratio_infact = min(max_image_edge / max(input_h, input_w), img_edge_ratio)
# print(
# image.shape,
# input_w,
# input_h,
# img_edge_ratio,
# max_image_edge,
# img_edge_ratio_infact,
# )
if img_edge_ratio != 1:
return (
img_edge_ratio_infact,
input_w * img_edge_ratio_infact,
input_h * img_edge_ratio_infact,
)
else:
return img_edge_ratio_infact, -1, -1
def limit_length(length):
"""limite generation video frames numer to avoid gpu memory overflow"""
if length > 24 * 6:
gr.Warning("Length need to smaller than 144, dute to gpu memory limit")
length = 24 * 6
return length
class ConcatenateBlock(gr.blocks.Block):
@ -121,97 +197,179 @@ with gr.Blocks(css=css) as demo:
with gr.Column():
prompt = gr.Textbox(label="Prompt")
image = gr.Image(label="VisionCondImage")
gr.Markdown("seed=-1 means that the seeds run each time are different")
seed = gr.Number(label="Seed", value=-1)
video_length = gr.Number(label="Video Length", value=12)
seed = gr.Number(
label="Seed (seed=-1 means that the seeds run each time are different)",
value=-1,
)
video_length = gr.Number(
label="Video Length(need smaller than 144,If you want to be able to generate longer videos, run it locally )",
value=12,
)
fps = gr.Number(label="Generate Video FPS", value=6)
gr.Markdown(
(
"If W&H is -1, then use the Reference Image's Size. Size of target video is $(W, H)*img\_edge\_ratio$. \n"
"The shorter the image size, the larger the motion amplitude, and the lower video quality.\n"
"The longer the W&H, the smaller the motion amplitude, and the higher video quality"
"The longer the W&H, the smaller the motion amplitude, and the higher video quality.\n"
"Due to the GPU VRAM limits, the W&H need smaller than 960px"
)
)
with gr.Row():
w = gr.Number(label="Width", value=-1)
h = gr.Number(label="Height", value=-1)
img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
with gr.Row():
out_w = gr.Number(label="Output Width", value=0, interactive=False)
out_h = gr.Number(label="Output Height", value=0, interactive=False)
img_edge_ratio_infact = gr.Number(
label="img_edge_ratio in fact",
value=1.0,
interactive=False,
)
btn1 = gr.Button("Generate")
out = gr.outputs.Video()
out = gr.Video()
# pdb.set_trace()
i2v_examples_256 = [
[
"(masterpiece, best quality, highres:1),(1boy, solo:1),(eye blinks:1.8),(head wave:1.3)",
"../../data/images/yongen.jpeg",
],
[
"(masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3)",
"../../data/images/The-Laughing-Cavalier.jpg",
],
]
with gr.Row():
board = gr.Dataframe(
value=[["", "", ""]] * 3,
interactive=False,
type="array",
label="Demo Video",
gr.Examples(
examples=i2v_examples_256,
inputs=[prompt, image],
outputs=[out],
fn=hf_online_t2v_inference,
cache_examples=False,
)
# image.change(fn=update_shape, inputs=[image], outputs=[w, h])
btn1.click(
fn=online_t2v_inference,
inputs=[prompt, image, seed, fps, w, h, video_length, img_edge_ratio],
outputs=out,
img_edge_ratio.change(
fn=limit_shape,
inputs=[image, w, h, img_edge_ratio],
outputs=[img_edge_ratio_infact, out_w, out_h],
)
with gr.Tab("Video to Video"):
with gr.Row():
with gr.Column():
prompt = gr.Textbox(label="Prompt")
gr.Markdown(
(
"pose of VisionCondImage should be same as of the first frame of the video. "
"its better generate target first frame whose pose is same as of first frame of the video with text2image tool, sch as MJ, SDXL."
)
)
image = gr.Image(label="VisionCondImage")
video = gr.Video(label="ReferVideo")
# radio = gr.inputs.Radio(, label="Select an option")
# ctr_button = gr.inputs.Button(label="Add ControlNet List")
# output_text = gr.outputs.Textbox()
processor = gr.Textbox(
label=f"Control Condition. gradio code now only support dwpose_body_hand, use command can support multi of {control_options}",
value="dwpose_body_hand",
)
gr.Markdown("seed=-1 means that seeds are different in every run")
seed = gr.Number(label="Seed", value=-1)
video_length = gr.Number(label="Video Length", value=12)
fps = gr.Number(label="Generate Video FPS", value=6)
gr.Markdown(
(
"If W&H is -1, then use the Reference Image's Size. Size of target video is $(W, H)*img\_edge\_ratio$. \n"
"The shorter the image size, the larger the motion amplitude, and the lower video quality. \n"
"The longer the W&H, the smaller the motion amplitude, and the higher video quality. "
)
)
with gr.Row():
w = gr.Number(label="Width", value=-1)
h = gr.Number(label="Height", value=-1)
img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
video_length.change(
fn=limit_length, inputs=[video_length], outputs=[video_length]
)
btn2 = gr.Button("Generate")
out1 = gr.outputs.Video()
# image.change(fn=update_shape, inputs=[image], outputs=[w, h])
btn2.click(
fn=online_v2v_inference,
btn1.click(
fn=hf_online_t2v_inference,
inputs=[
prompt,
image,
video,
processor,
seed,
fps,
w,
h,
video_length,
img_edge_ratio,
img_edge_ratio_infact,
],
outputs=out1,
outputs=out,
)
with gr.Tab("Video to Video"):
if ignore_video2video:
gr.Markdown(
(
"Due to GPU limit, MuseVDemo now only support Text2Video. If you want to try Video2Video, please run it locally. \n"
"We are trying to support video2video in the future. Thanks for your understanding."
)
)
else:
with gr.Row():
with gr.Column():
prompt = gr.Textbox(label="Prompt")
gr.Markdown(
(
"pose of VisionCondImage should be same as of the first frame of the video. "
"its better generate target first frame whose pose is same as of first frame of the video with text2image tool, sch as MJ, SDXL."
)
)
image = gr.Image(label="VisionCondImage")
video = gr.Video(label="ReferVideo")
# radio = gr.inputs.Radio(, label="Select an option")
# ctr_button = gr.inputs.Button(label="Add ControlNet List")
# output_text = gr.outputs.Textbox()
processor = gr.Textbox(
label=f"Control Condition. gradio code now only support dwpose_body_hand, use command can support multi of {control_options}",
value="dwpose_body_hand",
)
gr.Markdown("seed=-1 means that seeds are different in every run")
seed = gr.Number(
label="Seed (seed=-1 means that the seeds run each time are different)",
value=-1,
)
video_length = gr.Number(label="Video Length", value=12)
fps = gr.Number(label="Generate Video FPS", value=6)
gr.Markdown(
(
"If W&H is -1, then use the Reference Image's Size. Size of target video is $(W, H)*img\_edge\_ratio$. \n"
"The shorter the image size, the larger the motion amplitude, and the lower video quality.\n"
"The longer the W&H, the smaller the motion amplitude, and the higher video quality.\n"
"Due to the GPU VRAM limits, the W&H need smaller than 2000px"
)
)
with gr.Row():
w = gr.Number(label="Width", value=-1)
h = gr.Number(label="Height", value=-1)
img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
with gr.Row():
out_w = gr.Number(label="Width", value=0, interactive=False)
out_h = gr.Number(label="Height", value=0, interactive=False)
img_edge_ratio_infact = gr.Number(
label="img_edge_ratio in fact",
value=1.0,
interactive=False,
)
btn2 = gr.Button("Generate")
out1 = gr.Video()
v2v_examples_256 = [
[
"(masterpiece, best quality, highres:1), harley quinn is dancing, animation, by joshua klein",
"../../data/demo/cyber_girl.png",
"../../data/demo/video1.mp4",
],
]
with gr.Row():
gr.Examples(
examples=v2v_examples_256,
inputs=[prompt, image, video],
outputs=[out],
fn=hg_online_v2v_inference,
cache_examples=False,
)
img_edge_ratio.change(
fn=limit_shape,
inputs=[image, w, h, img_edge_ratio],
outputs=[img_edge_ratio_infact, out_w, out_h],
)
video_length.change(
fn=limit_length, inputs=[video_length], outputs=[video_length]
)
btn2.click(
fn=hg_online_v2v_inference,
inputs=[
prompt,
image,
video,
processor,
seed,
fps,
w,
h,
video_length,
img_edge_ratio_infact,
],
outputs=out1,
)
# Set the IP and port
ip_address = "0.0.0.0" # Replace with your desired IP address
@ -219,5 +377,5 @@ port_number = 7860 # Replace with your desired port number
demo.queue().launch(
share=False, debug=True, server_name=ip_address, server_port=port_number
share=True, debug=True, server_name=ip_address, server_port=port_number
)
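As a quick illustration of the new limits wired into the UI above, the sketch below mirrors the cap logic of `limit_shape` and `limit_length` with example numbers; it is a standalone restatement for explanation, not code from the demo.

```python
# Standalone restatement of the demo's caps: the longest image edge is held
# under max_image_edge (960 px) and the frame count under 24 * 6 = 144.
def capped_edge_ratio(input_h: int, input_w: int, img_edge_ratio: float, max_image_edge: int = 960) -> float:
    if img_edge_ratio == 0:
        img_edge_ratio = 1
    return min(max_image_edge / max(input_h, input_w), img_edge_ratio)


def capped_length(length: int) -> int:
    return min(length, 24 * 6)


# Example: a 1280x720 reference image with img_edge_ratio=1.0 is scaled by
# min(960 / 1280, 1.0) = 0.75, i.e. the video is rendered at 960x540.
print(capped_edge_ratio(720, 1280, 1.0))  # 0.75
print(capped_length(200))                 # 144
```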

View File

@ -0,0 +1,381 @@
import os
import time
import pdb
import cuid
import gradio as gr
import spaces
import numpy as np
from huggingface_hub import snapshot_download
ProjectDir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
CheckpointsDir = os.path.join(ProjectDir, "checkpoints")
ignore_video2video = True
def download_model():
if not os.path.exists(CheckpointsDir):
print("Checkpoint Not Downloaded, start downloading...")
tic = time.time()
snapshot_download(
repo_id="TMElyralab/MuseV",
local_dir=CheckpointsDir,
max_workers=8,
)
toc = time.time()
print(f"download cost {toc-tic} seconds")
else:
print("Already download the model.")
download_model() # for huggingface deployment.
if not ignore_video2video:
from gradio_video2video import online_v2v_inference
from gradio_text2video import online_t2v_inference
@spaces.GPU(duration=180)
def hf_online_t2v_inference(
prompt,
image_np,
seed,
fps,
w,
h,
video_len,
img_edge_ratio,
):
if not isinstance(image_np, np.ndarray): # None
raise gr.Error("Need input reference image")
return online_t2v_inference(
prompt, image_np, seed, fps, w, h, video_len, img_edge_ratio
)
@spaces.GPU(duration=180)
def hg_online_v2v_inference(
prompt,
image_np,
video,
processor,
seed,
fps,
w,
h,
video_length,
img_edge_ratio,
):
if not isinstance(image_np, np.ndarray): # None
raise gr.Error("Need input reference image")
return online_v2v_inference(
prompt,
image_np,
video,
processor,
seed,
fps,
w,
h,
video_length,
img_edge_ratio,
)
def limit_shape(image, input_w, input_h, img_edge_ratio, max_image_edge=960):
"""limite generation video shape to avoid gpu memory overflow"""
if isinstance(image, np.ndarray) and (input_h == -1 and input_w == -1):
input_h, input_w, _ = image.shape
h, w, _ = image.shape
if img_edge_ratio == 0:
img_edge_ratio = 1
img_edge_ratio_infact = min(max_image_edge / max(input_h, input_w), img_edge_ratio)
# print(
# image.shape,
# input_w,
# input_h,
# img_edge_ratio,
# max_image_edge,
# img_edge_ratio_infact,
# )
if img_edge_ratio != 1:
return (
img_edge_ratio_infact,
input_w * img_edge_ratio_infact,
input_h * img_edge_ratio_infact,
)
else:
return img_edge_ratio_infact, -1, -1
def limit_length(length):
"""limite generation video frames numer to avoid gpu memory overflow"""
if length > 24 * 6:
gr.Warning("Length need to smaller than 144, dute to gpu memory limit")
length = 24 * 6
return length
class ConcatenateBlock(gr.blocks.Block):
def __init__(self, options):
self.options = options
self.current_string = ""
def update_string(self, new_choice):
if new_choice and new_choice not in self.current_string.split(", "):
if self.current_string == "":
self.current_string = new_choice
else:
self.current_string += ", " + new_choice
return self.current_string
def process_input(new_choice):
return concatenate_block.update_string(new_choice), ""
control_options = [
"pose",
"pose_body",
"pose_hand",
"pose_face",
"pose_hand_body",
"pose_hand_face",
"dwpose",
"dwpose_face",
"dwpose_hand",
"dwpose_body",
"dwpose_body_hand",
"canny",
"tile",
"hed",
"hed_scribble",
"depth",
"pidi",
"normal_bae",
"lineart",
"lineart_anime",
"zoe",
"sam",
"mobile_sam",
"leres",
"content",
"face_detector",
]
concatenate_block = ConcatenateBlock(control_options)
css = """#input_img {max-width: 1024px !important} #output_vid {max-width: 1024px; max-height: 576px}"""
with gr.Blocks(css=css) as demo:
gr.Markdown(
"<div align='center'> <h1> MuseV: Infinite-length and High Fidelity Virtual Human Video Generation with Visual Conditioned Parallel Denoising</span> </h1> \
<h2 style='font-weight: 450; font-size: 1rem; margin: 0rem'>\
</br>\
Zhiqiang Xia <sup>*</sup>,\
Zhaokang Chen<sup>*</sup>,\
Bin Wu<sup></sup>,\
Chao Li,\
Kwok-Wai Hung,\
Chao Zhan,\
Yingjie He,\
Wenjiang Zhou\
(<sup>*</sup>Equal Contribution, <sup></sup>Corresponding Author, benbinwu@tencent.com)\
</br>\
Lyra Lab, Tencent Music Entertainment\
</h2> \
<a style='font-size:18px;color: #000000' href='https://github.com/TMElyralab/MuseV'>[Github Repo]</a>\
<a style='font-size:18px;color: #000000'>, which is important to Open-Source projects. Thanks!</a>\
<a style='font-size:18px;color: #000000' href=''> [ArXiv(Coming Soon)] </a>\
<a style='font-size:18px;color: #000000' href=''> [Project Page(Coming Soon)] </a> \
<a style='font-size:18px;color: #000000'>If MuseV is useful, please help star the repo~ </a> </div>"
)
with gr.Tab("Text to Video"):
with gr.Row():
with gr.Column():
prompt = gr.Textbox(label="Prompt")
image = gr.Image(label="VisionCondImage")
seed = gr.Number(
label="Seed (seed=-1 means that the seeds run each time are different)",
value=-1,
)
video_length = gr.Number(
label="Video Length(need smaller than 144,If you want to be able to generate longer videos, run it locally )",
value=12,
)
fps = gr.Number(label="Generate Video FPS", value=6)
gr.Markdown(
(
"If W&H is -1, then use the Reference Image's Size. Size of target video is $(W, H)*img\_edge\_ratio$. \n"
"The shorter the image size, the larger the motion amplitude, and the lower video quality.\n"
"The longer the W&H, the smaller the motion amplitude, and the higher video quality.\n"
"Due to the GPU VRAM limits, the W&H need smaller than 960px"
)
)
with gr.Row():
w = gr.Number(label="Width", value=-1)
h = gr.Number(label="Height", value=-1)
img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
with gr.Row():
out_w = gr.Number(label="Output Width", value=0, interactive=False)
out_h = gr.Number(label="Output Height", value=0, interactive=False)
img_edge_ratio_infact = gr.Number(
label="img_edge_ratio in fact",
value=1.0,
interactive=False,
)
btn1 = gr.Button("Generate")
out = gr.Video()
# pdb.set_trace()
i2v_examples_256 = [
[
"(masterpiece, best quality, highres:1),(1boy, solo:1),(eye blinks:1.8),(head wave:1.3)",
"../../data/images/yongen.jpeg",
],
[
"(masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3)",
"../../data/images/The-Laughing-Cavalier.jpg",
],
]
with gr.Row():
gr.Examples(
examples=i2v_examples_256,
inputs=[prompt, image],
outputs=[out],
fn=hf_online_t2v_inference,
cache_examples=False,
)
img_edge_ratio.change(
fn=limit_shape,
inputs=[image, w, h, img_edge_ratio],
outputs=[img_edge_ratio_infact, out_w, out_h],
)
video_length.change(
fn=limit_length, inputs=[video_length], outputs=[video_length]
)
btn1.click(
fn=hf_online_t2v_inference,
inputs=[
prompt,
image,
seed,
fps,
w,
h,
video_length,
img_edge_ratio_infact,
],
outputs=out,
)
with gr.Tab("Video to Video"):
if ignore_video2video:
gr.Markdown(
(
"Due to GPU limit, MuseVDemo now only support Text2Video. If you want to try Video2Video, please run it locally. \n"
"We are trying to support video2video in the future. Thanks for your understanding."
)
)
else:
with gr.Row():
with gr.Column():
prompt = gr.Textbox(label="Prompt")
gr.Markdown(
(
"pose of VisionCondImage should be same as of the first frame of the video. "
"its better generate target first frame whose pose is same as of first frame of the video with text2image tool, sch as MJ, SDXL."
)
)
image = gr.Image(label="VisionCondImage")
video = gr.Video(label="ReferVideo")
# radio = gr.inputs.Radio(, label="Select an option")
# ctr_button = gr.inputs.Button(label="Add ControlNet List")
# output_text = gr.outputs.Textbox()
processor = gr.Textbox(
label=f"Control Condition. gradio code now only support dwpose_body_hand, use command can support multi of {control_options}",
value="dwpose_body_hand",
)
gr.Markdown("seed=-1 means that seeds are different in every run")
seed = gr.Number(
label="Seed (seed=-1 means that the seeds run each time are different)",
value=-1,
)
video_length = gr.Number(label="Video Length", value=12)
fps = gr.Number(label="Generate Video FPS", value=6)
gr.Markdown(
(
"If W&H is -1, then use the Reference Image's Size. Size of target video is $(W, H)*img\_edge\_ratio$. \n"
"The shorter the image size, the larger the motion amplitude, and the lower video quality.\n"
"The longer the W&H, the smaller the motion amplitude, and the higher video quality.\n"
"Due to the GPU VRAM limits, the W&H need smaller than 2000px"
)
)
with gr.Row():
w = gr.Number(label="Width", value=-1)
h = gr.Number(label="Height", value=-1)
img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
with gr.Row():
out_w = gr.Number(label="Width", value=0, interactive=False)
out_h = gr.Number(label="Height", value=0, interactive=False)
img_edge_ratio_infact = gr.Number(
label="img_edge_ratio in fact",
value=1.0,
interactive=False,
)
btn2 = gr.Button("Generate")
out1 = gr.Video()
v2v_examples_256 = [
[
"(masterpiece, best quality, highres:1), harley quinn is dancing, animation, by joshua klein",
"../../data/demo/cyber_girl.png",
"../../data/demo/video1.mp4",
],
]
with gr.Row():
gr.Examples(
examples=v2v_examples_256,
inputs=[prompt, image, video],
outputs=[out],
fn=hg_online_v2v_inference,
cache_examples=False,
)
img_edge_ratio.change(
fn=limit_shape,
inputs=[image, w, h, img_edge_ratio],
outputs=[img_edge_ratio_infact, out_w, out_h],
)
video_length.change(
fn=limit_length, inputs=[video_length], outputs=[video_length]
)
btn2.click(
fn=hg_online_v2v_inference,
inputs=[
prompt,
image,
video,
processor,
seed,
fps,
w,
h,
video_length,
img_edge_ratio_infact,
],
outputs=out1,
)
# Set the IP and port
ip_address = "0.0.0.0" # Replace with your desired IP address
port_number = 7860 # Replace with your desired port number
demo.queue().launch(
share=True, debug=True, server_name=ip_address, server_port=port_number
)

View File

@ -0,0 +1,416 @@
import os
import time
import pdb
import cuid
import gradio as gr
import spaces
import numpy as np
import sys
from huggingface_hub import snapshot_download
import subprocess
ProjectDir = os.path.abspath(os.path.dirname(__file__))
CheckpointsDir = os.path.join(ProjectDir, "checkpoints")
sys.path.insert(0, ProjectDir)
sys.path.insert(0, f"{ProjectDir}/MMCM")
sys.path.insert(0, f"{ProjectDir}/diffusers/src")
sys.path.insert(0, f"{ProjectDir}/controlnet_aux/src")
sys.path.insert(0, f"{ProjectDir}/scripts/gradio")
result = subprocess.run(
["pip", "install", "--no-cache-dir", "-U", "openmim"],
capture_output=True,
text=True,
)
print(result)
result = subprocess.run(["mim", "install", "mmengine"], capture_output=True, text=True)
print(result)
result = subprocess.run(
["mim", "install", "mmcv>=2.0.1"], capture_output=True, text=True
)
print(result)
result = subprocess.run(
["mim", "install", "mmdet>=3.1.0"], capture_output=True, text=True
)
print(result)
result = subprocess.run(
["mim", "install", "mmpose>=1.1.0"], capture_output=True, text=True
)
print(result)
ignore_video2video = True
def download_model():
if not os.path.exists(CheckpointsDir):
print("Checkpoint Not Downloaded, start downloading...")
tic = time.time()
snapshot_download(
repo_id="TMElyralab/MuseV",
local_dir=CheckpointsDir,
max_workers=8,
local_dir_use_symlinks=True,
)
toc = time.time()
print(f"download cost {toc-tic} seconds")
else:
print("Already download the model.")
download_model() # for huggingface deployment.
if not ignore_video2video:
from gradio_video2video import online_v2v_inference
from gradio_text2video import online_t2v_inference
@spaces.GPU(duration=180)
def hf_online_t2v_inference(
prompt,
image_np,
seed,
fps,
w,
h,
video_len,
img_edge_ratio,
):
if not isinstance(image_np, np.ndarray): # None
raise gr.Error("Need input reference image")
return online_t2v_inference(
prompt, image_np, seed, fps, w, h, video_len, img_edge_ratio
)
@spaces.GPU(duration=180)
def hg_online_v2v_inference(
prompt,
image_np,
video,
processor,
seed,
fps,
w,
h,
video_length,
img_edge_ratio,
):
if not isinstance(image_np, np.ndarray): # None
raise gr.Error("Need input reference image")
return online_v2v_inference(
prompt,
image_np,
video,
processor,
seed,
fps,
w,
h,
video_length,
img_edge_ratio,
)
def limit_shape(image, input_w, input_h, img_edge_ratio, max_image_edge=960):
"""limite generation video shape to avoid gpu memory overflow"""
if isinstance(image, np.ndarray) and (input_h == -1 and input_w == -1):
input_h, input_w, _ = image.shape
h, w, _ = image.shape
if img_edge_ratio == 0:
img_edge_ratio = 1
img_edge_ratio_infact = min(max_image_edge / max(input_h, input_w), img_edge_ratio)
# print(
# image.shape,
# input_w,
# input_h,
# img_edge_ratio,
# max_image_edge,
# img_edge_ratio_infact,
# )
if img_edge_ratio != 1:
return (
img_edge_ratio_infact,
input_w * img_edge_ratio_infact,
input_h * img_edge_ratio_infact,
)
else:
return img_edge_ratio_infact, -1, -1
def limit_length(length):
"""limite generation video frames numer to avoid gpu memory overflow"""
if length > 24 * 6:
gr.Warning("Length need to smaller than 144, dute to gpu memory limit")
length = 24 * 6
return length
class ConcatenateBlock(gr.blocks.Block):
def __init__(self, options):
self.options = options
self.current_string = ""
def update_string(self, new_choice):
if new_choice and new_choice not in self.current_string.split(", "):
if self.current_string == "":
self.current_string = new_choice
else:
self.current_string += ", " + new_choice
return self.current_string
def process_input(new_choice):
return concatenate_block.update_string(new_choice), ""
control_options = [
"pose",
"pose_body",
"pose_hand",
"pose_face",
"pose_hand_body",
"pose_hand_face",
"dwpose",
"dwpose_face",
"dwpose_hand",
"dwpose_body",
"dwpose_body_hand",
"canny",
"tile",
"hed",
"hed_scribble",
"depth",
"pidi",
"normal_bae",
"lineart",
"lineart_anime",
"zoe",
"sam",
"mobile_sam",
"leres",
"content",
"face_detector",
]
concatenate_block = ConcatenateBlock(control_options)
css = """#input_img {max-width: 1024px !important} #output_vid {max-width: 1024px; max-height: 576px}"""
with gr.Blocks(css=css) as demo:
gr.Markdown(
"<div align='center'> <h1> MuseV: Infinite-length and High Fidelity Virtual Human Video Generation with Visual Conditioned Parallel Denoising</span> </h1> \
<h2 style='font-weight: 450; font-size: 1rem; margin: 0rem'>\
</br>\
Zhiqiang Xia <sup>*</sup>,\
Zhaokang Chen<sup>*</sup>,\
Bin Wu<sup></sup>,\
Chao Li,\
Kwok-Wai Hung,\
Chao Zhan,\
Yingjie He,\
Wenjiang Zhou\
(<sup>*</sup>Equal Contribution, <sup></sup>Corresponding Author, benbinwu@tencent.com)\
</br>\
Lyra Lab, Tencent Music Entertainment\
</h2> \
<a style='font-size:18px;color: #000000' href='https://github.com/TMElyralab/MuseV'>[Github Repo]</a>\
<a style='font-size:18px;color: #000000'>, which is important to Open-Source projects. Thanks!</a>\
<a style='font-size:18px;color: #000000' href=''> [ArXiv(Coming Soon)] </a>\
<a style='font-size:18px;color: #000000' href=''> [Project Page(Coming Soon)] </a> \
<a style='font-size:18px;color: #000000'>If MuseV is useful, please help star the repo~ </a> </div>"
)
with gr.Tab("Text to Video"):
with gr.Row():
with gr.Column():
prompt = gr.Textbox(label="Prompt")
image = gr.Image(label="VisionCondImage")
seed = gr.Number(
label="Seed (seed=-1 means that the seeds run each time are different)",
value=-1,
)
video_length = gr.Number(
label="Video Length(need smaller than 144,If you want to be able to generate longer videos, run it locally )",
value=12,
)
fps = gr.Number(label="Generate Video FPS", value=6)
gr.Markdown(
(
"If W&H is -1, then use the Reference Image's Size. Size of target video is $(W, H)*img\_edge\_ratio$. \n"
"The shorter the image size, the larger the motion amplitude, and the lower video quality.\n"
"The longer the W&H, the smaller the motion amplitude, and the higher video quality.\n"
"Due to the GPU VRAM limits, the W&H need smaller than 960px"
)
)
with gr.Row():
w = gr.Number(label="Width", value=-1)
h = gr.Number(label="Height", value=-1)
img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
with gr.Row():
out_w = gr.Number(label="Output Width", value=0, interactive=False)
out_h = gr.Number(label="Output Height", value=0, interactive=False)
img_edge_ratio_infact = gr.Number(
label="img_edge_ratio in fact",
value=1.0,
interactive=False,
)
btn1 = gr.Button("Generate")
out = gr.Video()
# pdb.set_trace()
i2v_examples_256 = [
[
"(masterpiece, best quality, highres:1),(1boy, solo:1),(eye blinks:1.8),(head wave:1.3)",
"../../data/images/yongen.jpeg",
],
[
"(masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3)",
"../../data/images/The-Laughing-Cavalier.jpg",
],
]
with gr.Row():
gr.Examples(
examples=i2v_examples_256,
inputs=[prompt, image],
outputs=[out],
fn=hf_online_t2v_inference,
cache_examples=False,
)
img_edge_ratio.change(
fn=limit_shape,
inputs=[image, w, h, img_edge_ratio],
outputs=[img_edge_ratio_infact, out_w, out_h],
)
video_length.change(
fn=limit_length, inputs=[video_length], outputs=[video_length]
)
btn1.click(
fn=hf_online_t2v_inference,
inputs=[
prompt,
image,
seed,
fps,
w,
h,
video_length,
img_edge_ratio_infact,
],
outputs=out,
)
with gr.Tab("Video to Video"):
if ignore_video2video:
gr.Markdown(
(
"Due to GPU limit, MuseVDemo now only support Text2Video. If you want to try Video2Video, please run it locally. \n"
"We are trying to support video2video in the future. Thanks for your understanding."
)
)
else:
with gr.Row():
with gr.Column():
prompt = gr.Textbox(label="Prompt")
gr.Markdown(
(
"pose of VisionCondImage should be same as of the first frame of the video. "
"its better generate target first frame whose pose is same as of first frame of the video with text2image tool, sch as MJ, SDXL."
)
)
image = gr.Image(label="VisionCondImage")
video = gr.Video(label="ReferVideo")
# radio = gr.inputs.Radio(, label="Select an option")
# ctr_button = gr.inputs.Button(label="Add ControlNet List")
# output_text = gr.outputs.Textbox()
processor = gr.Textbox(
label=f"Control Condition. gradio code now only support dwpose_body_hand, use command can support multi of {control_options}",
value="dwpose_body_hand",
)
gr.Markdown("seed=-1 means that seeds are different in every run")
seed = gr.Number(
label="Seed (seed=-1 means that the seeds run each time are different)",
value=-1,
)
video_length = gr.Number(label="Video Length", value=12)
fps = gr.Number(label="Generate Video FPS", value=6)
gr.Markdown(
(
"If W&H is -1, then use the Reference Image's Size. Size of target video is $(W, H)*img\_edge\_ratio$. \n"
"The shorter the image size, the larger the motion amplitude, and the lower video quality.\n"
"The longer the W&H, the smaller the motion amplitude, and the higher video quality.\n"
"Due to the GPU VRAM limits, the W&H need smaller than 2000px"
)
)
with gr.Row():
w = gr.Number(label="Width", value=-1)
h = gr.Number(label="Height", value=-1)
img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
with gr.Row():
out_w = gr.Number(label="Width", value=0, interactive=False)
out_h = gr.Number(label="Height", value=0, interactive=False)
img_edge_ratio_infact = gr.Number(
label="img_edge_ratio in fact",
value=1.0,
interactive=False,
)
btn2 = gr.Button("Generate")
out1 = gr.Video()
v2v_examples_256 = [
[
"(masterpiece, best quality, highres:1), harley quinn is dancing, animation, by joshua klein",
"../../data/demo/cyber_girl.png",
"../../data/demo/video1.mp4",
],
]
with gr.Row():
gr.Examples(
examples=v2v_examples_256,
inputs=[prompt, image, video],
outputs=[out],
fn=hg_online_v2v_inference,
cache_examples=False,
)
img_edge_ratio.change(
fn=limit_shape,
inputs=[image, w, h, img_edge_ratio],
outputs=[img_edge_ratio_infact, out_w, out_h],
)
video_length.change(
fn=limit_length, inputs=[video_length], outputs=[video_length]
)
btn2.click(
fn=hg_online_v2v_inference,
inputs=[
prompt,
image,
video,
processor,
seed,
fps,
w,
h,
video_length,
img_edge_ratio_infact,
],
outputs=out1,
)
# Set the IP and port
ip_address = "0.0.0.0" # Replace with your desired IP address
port_number = 7860 # Replace with your desired port number
demo.queue().launch(
share=True, debug=True, server_name=ip_address, server_port=port_number
)
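The subprocess calls at the top of this file install the mmlab stack at startup; they correspond to roughly the same commands the README gives for a manual setup (assuming pip and mim are on PATH):

```bash
pip install --no-cache-dir -U openmim
mim install mmengine
mim install "mmcv>=2.0.1"
mim install "mmdet>=3.1.0"
mim install "mmpose>=1.1.0"
```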

View File

@ -1,223 +0,0 @@
import os
import time
import pdb
import cuid
import gradio as gr
from huggingface_hub import snapshot_download
ProjectDir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
CheckpointsDir = os.path.join(ProjectDir, "checkpoints")
def download_model():
if not os.path.exists(CheckpointsDir):
print("Checkpoint Not Downloaded, start downloading...")
tic = time.time()
snapshot_download(
repo_id="TMElyralab/MuseV",
local_dir=CheckpointsDir,
max_workers=8,
)
toc = time.time()
print(f"download cost {toc-tic} seconds")
else:
print("Already download the model.")
download_model() # for huggingface deployment.
from gradio_video2video import online_v2v_inference
from gradio_text2video import online_t2v_inference
def update_shape(image):
if image != None:
h, w, _ = image.shape
else:
h, w = 768, 512
return w, h
class ConcatenateBlock(gr.blocks.Block):
def __init__(self, options):
self.options = options
self.current_string = ""
def update_string(self, new_choice):
if new_choice and new_choice not in self.current_string.split(", "):
if self.current_string == "":
self.current_string = new_choice
else:
self.current_string += ", " + new_choice
return self.current_string
def process_input(new_choice):
return concatenate_block.update_string(new_choice), ""
control_options = [
"pose",
"pose_body",
"pose_hand",
"pose_face",
"pose_hand_body",
"pose_hand_face",
"dwpose",
"dwpose_face",
"dwpose_hand",
"dwpose_body",
"dwpose_body_hand",
"canny",
"tile",
"hed",
"hed_scribble",
"depth",
"pidi",
"normal_bae",
"lineart",
"lineart_anime",
"zoe",
"sam",
"mobile_sam",
"leres",
"content",
"face_detector",
]
concatenate_block = ConcatenateBlock(control_options)
css = """#input_img {max-width: 1024px !important} #output_vid {max-width: 1024px; max-height: 576px}"""
with gr.Blocks(css=css) as demo:
gr.Markdown(
"<div align='center'> <h1> MuseV: Infinite-length and High Fidelity Virtual Human Video Generation with Visual Conditioned Parallel Denoising</span> </h1> \
<h2 style='font-weight: 450; font-size: 1rem; margin: 0rem'>\
</br>\
Zhiqiang Xia <sup>*</sup>,\
Zhaokang Chen<sup>*</sup>,\
Bin Wu<sup></sup>,\
Chao Li,\
Kwok-Wai Hung,\
Chao Zhan,\
Yingjie He,\
Wenjiang Zhou\
(<sup>*</sup>Equal Contribution, <sup></sup>Corresponding Author, benbinwu@tencent.com)\
</br>\
Lyra Lab, Tencent Music Entertainment\
</h2> \
<a style='font-size:18px;color: #000000' href='https://github.com/TMElyralab/MuseV'>[Github Repo]</a>\
<a style='font-size:18px;color: #000000'>, which is important to Open-Source projects. Thanks!</a>\
<a style='font-size:18px;color: #000000' href=''> [ArXiv(Coming Soon)] </a>\
<a style='font-size:18px;color: #000000' href=''> [Project Page(Coming Soon)] </a> \
<a style='font-size:18px;color: #000000'>If MuseV is useful, please help star the repo~ </a> </div>"
)
with gr.Tab("Text to Video"):
with gr.Row():
with gr.Column():
prompt = gr.Textbox(label="Prompt")
image = gr.Image(label="VisionCondImage")
gr.Markdown("seed=-1 means that the seeds run each time are different")
seed = gr.Number(label="Seed", value=-1)
video_length = gr.Number(label="Video Length", value=12)
fps = gr.Number(label="Generate Video FPS", value=6)
gr.Markdown(
(
"If W&H is -1, then use the Reference Image's Size. Size of target video is $(W, H)*img\_edge\_ratio$. \n"
"The shorter the image size, the larger the motion amplitude, and the lower video quality.\n"
"The longer the W&H, the smaller the motion amplitude, and the higher video quality"
)
)
with gr.Row():
w = gr.Number(label="Width", value=-1)
h = gr.Number(label="Height", value=-1)
img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
btn1 = gr.Button("Generate")
out = gr.outputs.Video()
# pdb.set_trace()
with gr.Row():
board = gr.Dataframe(
value=[["", "", ""]] * 3,
interactive=False,
type="array",
label="Demo Video",
)
# image.change(fn=update_shape, inputs=[image], outputs=[w, h])
btn1.click(
fn=online_t2v_inference,
inputs=[prompt, image, seed, fps, w, h, video_length, img_edge_ratio],
outputs=out,
)
with gr.Tab("Video to Video"):
with gr.Row():
with gr.Column():
prompt = gr.Textbox(label="Prompt")
gr.Markdown(
(
"pose of VisionCondImage should be same as of the first frame of the video. "
"its better generate target first frame whose pose is same as of first frame of the video with text2image tool, sch as MJ, SDXL."
)
)
image = gr.Image(label="VisionCondImage")
video = gr.Video(label="ReferVideo")
# radio = gr.inputs.Radio(, label="Select an option")
# ctr_button = gr.inputs.Button(label="Add ControlNet List")
# output_text = gr.outputs.Textbox()
processor = gr.Textbox(
label=f"Control Condition. gradio code now only support dwpose_body_hand, use command can support multi of {control_options}",
value="dwpose_body_hand",
)
gr.Markdown("seed=-1 means that seeds are different in every run")
seed = gr.Number(label="Seed", value=-1)
video_length = gr.Number(label="Video Length", value=12)
fps = gr.Number(label="Generate Video FPS", value=6)
gr.Markdown(
(
"If W&H is -1, then use the Reference Image's Size. Size of target video is $(W, H)*img\_edge\_ratio$. \n"
"The shorter the image size, the larger the motion amplitude, and the lower video quality. \n"
"The longer the W&H, the smaller the motion amplitude, and the higher video quality. "
)
)
with gr.Row():
w = gr.Number(label="Width", value=-1)
h = gr.Number(label="Height", value=-1)
img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
btn2 = gr.Button("Generate")
out1 = gr.outputs.Video()
# image.change(fn=update_shape, inputs=[image], outputs=[w, h])
btn2.click(
fn=online_v2v_inference,
inputs=[
prompt,
image,
video,
processor,
seed,
fps,
w,
h,
video_length,
img_edge_ratio,
],
outputs=out1,
)
# Set the IP and port
ip_address = "0.0.0.0" # Replace with your desired IP address
port_number = 7860 # Replace with your desired port number
demo.queue().launch(
share=False, debug=True, server_name=ip_address, server_port=port_number
)

View File

@ -45,10 +45,8 @@ from musev.models.unet_loader import load_unet_by_name
from musev.utils.util import save_videos_grid_with_opencv
from musev import logger
need_load_predictor = False
if need_load_predictor:
video_sd_predictor = None
else:
use_v2v_predictor = False
if use_v2v_predictor:
from gradio_video2video import sd_predictor as video_sd_predictor
logger.setLevel("INFO")
@ -464,7 +462,7 @@ def read_image_and_name(path):
return images, name
if referencenet_model_name is not None and need_load_predictor:
if referencenet_model_name is not None and not use_v2v_predictor:
referencenet = load_referencenet_by_name(
model_name=referencenet_model_name,
# sd_model=sd_model_path,
@ -476,7 +474,7 @@ else:
referencenet = None
referencenet_model_name = "no"
if vision_clip_extractor_class_name is not None and need_load_predictor:
if vision_clip_extractor_class_name is not None and not use_v2v_predictor:
vision_clip_extractor = load_vision_clip_encoder_by_name(
ip_image_encoder=vision_clip_model_path,
vision_clip_extractor_class_name=vision_clip_extractor_class_name,
@ -488,7 +486,7 @@ else:
vision_clip_extractor = None
logger.info(f"vision_clip_extractor, None")
if ip_adapter_model_name is not None and need_load_predictor:
if ip_adapter_model_name is not None and not use_v2v_predictor:
ip_adapter_image_proj = load_ip_adapter_image_proj_by_name(
model_name=ip_adapter_model_name,
ip_image_encoder=ip_adapter_model_params_dict.get(
@ -526,11 +524,11 @@ for model_name, sd_model_params in sd_model_params_dict.items():
strict=not (facein_model_name is not None),
need_t2i_ip_adapter_face=ip_adapter_face_model_name is not None,
)
if need_load_predictor
if not use_v2v_predictor
else None
)
if facein_model_name is not None and need_load_predictor:
if facein_model_name is not None and not use_v2v_predictor:
(
face_emb_extractor,
facein_image_proj,
@ -552,7 +550,7 @@ for model_name, sd_model_params in sd_model_params_dict.items():
face_emb_extractor = None
facein_image_proj = None
if ip_adapter_face_model_name is not None and need_load_predictor:
if ip_adapter_face_model_name is not None and not use_v2v_predictor:
(
ip_adapter_face_emb_extractor,
ip_adapter_face_image_proj,
@ -595,10 +593,10 @@ for model_name, sd_model_params in sd_model_params_dict.items():
ip_adapter_face_emb_extractor=ip_adapter_face_emb_extractor,
ip_adapter_face_image_proj=ip_adapter_face_image_proj,
)
if need_load_predictor
if not use_v2v_predictor
else video_sd_predictor
)
if not need_load_predictor:
if use_v2v_predictor:
print(
"text2video use video_sd_predictor, sd_predictor type is ",
type(sd_predictor),