update gradio (#55)

* fix GPU VRAM limitation

* add time limitation

* reshape logic

* <feature> add examples and input limitations

* update gradio scripts and requirements

---------

Co-authored-by: trumpzhan <trumpzhan@tencent.com>
This commit is contained in:
xzqjack 2024-04-10 16:08:45 +08:00 committed by GitHub
parent 07fa407dc5
commit 504d705db4
14 changed files with 1109 additions and 346 deletions

View File

@ -14,6 +14,5 @@ RUN . /opt/conda/etc/profile.d/conda.sh \
&& echo "source activate musev" >> ~/.bashrc \
&& conda activate musev \
&& conda env list \
&& pip install cuid
&& pip --no-cache-dir install cuid gradio==4.12 spaces
USER root
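For readers setting up without the image, a rough shell equivalent of the updated Dockerfile layer above; this is a sketch that assumes the `musev` conda environment already exists in the base image.

```bash
# Approximate shell equivalent of the updated Dockerfile layer
# (assumption: the musev conda env is already provided by the base image).
source /opt/conda/etc/profile.d/conda.sh
conda activate musev
pip install --no-cache-dir cuid gradio==4.12 spaces
```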

View File

@ -65,8 +65,7 @@ Wenjiang Zhou
<td >
<video src="https://github.com/TMElyralab/MuseV/assets/163980830/732cf1fd-25e7-494e-b462-969c9425d277" width="100" controls preload></video>
</td>
<td>(masterpiece, best quality, highres:1),(1girl, solo:1),(beautiful face,
soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
<td>(masterpiece, best quality, highres:1),(1boy, solo:1),(eye blinks:1.8),(head wave:1.3)
</td>
</tr>
@ -159,8 +158,7 @@ Wenjiang Zhou
<video src="https://github.com/TMElyralab/MuseV/assets/163980830/28294baa-b996-420f-b1fb-046542adf87d" width="100" controls preload></video>
</td>
<td>
(masterpiece, best quality, highres:1),(1man, solo:1),(beautiful face,
soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
(masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3),Chinese ink painting style
</td>
</tr>
@ -184,8 +182,7 @@ Wenjiang Zhou
<video src="https://github.com/TMElyralab/MuseV/assets/163980830/4072410a-ecea-4ee5-a9b4-735f9f462d51" width="100" controls preload></video>
</td>
<td>
(masterpiece, best quality, highres:1),(1man, solo:1),(beautiful face,
soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
(masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3)
</td>
</tr>
<tr>
@ -196,8 +193,7 @@ Wenjiang Zhou
<video src="https://github.com/TMElyralab/MuseV/assets/163980830/5148beda-a1e1-44f0-ad84-2fb99ad73a11" width="100" controls preload></video>
</td>
<td>
(masterpiece, best quality, highres:1),(1girl, solo:1),(beautiful face,
soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
(masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3), animate
</td>
</tr>
<tr>
@ -283,7 +279,7 @@ Wenjiang Zhou
<video src="https://github.com/TMElyralab/MuseV/assets/163980830/484cc69d-c316-4464-a55b-3df929780a8e" width="400" controls preload></video>
</td>
<td>
(masterpiece, best quality, highres:1)
(masterpiece, best quality, highres:1) , a girl is dancing, animation
</td>
</tr>
<tr>
@ -294,7 +290,7 @@ Wenjiang Zhou
<video src="https://github.com/TMElyralab/MuseV/assets/163980830/c44682e6-aafc-4730-8fc1-72825c1bacf2" width="400" controls preload></video>
</td>
<td>
(masterpiece, best quality, highres:1)
(masterpiece, best quality, highres:1), is dancing, animation
</td>
</tr>
</table >
@ -374,7 +370,7 @@ pip install -r requirements.txt
#### Prepare the [openmmlab](https://openmmlab.com/) packages
If not using Docker, the mmlab packages also need to be installed.
```bash
pip install--no-cache-dir -U openmim
pip install --no-cache-dir -U openmim
mim install mmengine
mim install "mmcv>=2.0.1"
mim install "mmdet>=3.1.0"

View File

@ -65,8 +65,7 @@ Examples below can be accessed at `configs/tasks/example.yaml`
<td >
<video src="https://github.com/TMElyralab/MuseV/assets/163980830/732cf1fd-25e7-494e-b462-969c9425d277" width="100" controls preload></video>
</td>
<td>(masterpiece, best quality, highres:1),(1girl, solo:1),(beautiful face,
soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
<td>(masterpiece, best quality, highres:1),(1boy, solo:1),(eye blinks:1.8),(head wave:1.3)
</td>
</tr>
@ -159,8 +158,7 @@ Examples below can be accessed at `configs/tasks/example.yaml`
<video src="https://github.com/TMElyralab/MuseV/assets/163980830/28294baa-b996-420f-b1fb-046542adf87d" width="100" controls preload></video>
</td>
<td>
(masterpiece, best quality, highres:1),(1man, solo:1),(beautiful face,
soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
(masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3),Chinese ink painting style
</td>
</tr>
@ -184,8 +182,7 @@ Examples below can be accessed at `configs/tasks/example.yaml`
<video src="https://github.com/TMElyralab/MuseV/assets/163980830/4072410a-ecea-4ee5-a9b4-735f9f462d51" width="100" controls preload></video>
</td>
<td>
(masterpiece, best quality, highres:1),(1man, solo:1),(beautiful face,
soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
(masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3)
</td>
</tr>
<tr>
@ -196,8 +193,7 @@ Examples below can be accessed at `configs/tasks/example.yaml`
<video src="https://github.com/TMElyralab/MuseV/assets/163980830/5148beda-a1e1-44f0-ad84-2fb99ad73a11" width="100" controls preload></video>
</td>
<td>
(masterpiece, best quality, highres:1),(1girl, solo:1),(beautiful face,
soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
(masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3), animate
</td>
</tr>
<tr>
@ -279,7 +275,7 @@ In `duffy` mode, pose of the vision condition frame is not aligned with the firs
<video src="https://github.com/TMElyralab/MuseV/assets/163980830/484cc69d-c316-4464-a55b-3df929780a8e" width="400" controls preload></video>
</td>
<td>
(masterpiece, best quality, highres:1)
(masterpiece, best quality, highres:1) , a girl is dancing, animation
</td>
</tr>
<tr>
@ -290,7 +286,7 @@ In `duffy` mode, pose of the vision condition frame is not aligned with the firs
<video src="https://github.com/TMElyralab/MuseV/assets/163980830/c44682e6-aafc-4730-8fc1-72825c1bacf2" width="400" controls preload></video>
</td>
<td>
(masterpiece, best quality, highres:1)
(masterpiece, best quality, highres:1), is dancing, animation
</td>
</tr>
</table >
@ -370,7 +366,7 @@ pip install -r requirements.txt
#### Prepare mmlab package
If not using Docker, the mmlab packages should be installed additionally.
```bash
pip install--no-cache-dir -U openmim
pip install --no-cache-dir -U openmim
mim install mmengine
mim install "mmcv>=2.0.1"
mim install "mmdet>=3.1.0"

View File

@ -15,8 +15,7 @@
img_length_ratio: 0.957
ipadapter_image: ${.condition_images}
name: yongen
prompt: (masterpiece, best quality, highres:1),(1girl, solo:1),(beautiful face,
soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
prompt: (masterpiece, best quality, highres:1),(1boy, solo:1),(eye blinks:1.8),(head wave:1.3)
refer_image: ${.condition_images}
video_path: null
width: 736
@ -97,8 +96,7 @@
img_length_ratio: 1.495
ipadapter_image: ${.condition_images}
name: dufu
prompt: (masterpiece, best quality, highres:1),(1girl, solo:1),(beautiful face,
soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
prompt: (masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3),Chinese ink painting style
refer_image: ${.condition_images}
video_path: null
width: 471
@ -119,8 +117,7 @@
img_length_ratio: 0.88
ipadapter_image: ${.condition_images}
name: Portrait-of-Dr.-Gachet
prompt: (masterpiece, best quality, highres:1),(1girl, solo:1),(beautiful face,
soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
prompt: (masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3)
refer_image: ${.condition_images}
video_path: null
width: 800
@ -130,8 +127,7 @@
img_length_ratio: 1.246
ipadapter_image: ${.condition_images}
name: Self-Portrait-with-Cropped-Hair
prompt: (masterpiece, best quality, highres:1),(1girl, solo:1),(beautiful face,
soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
prompt: (masterpiece, best quality, highres:1),(1boy, solo:1),(eye blinks:1.8),(head wave:1.3), animate
refer_image: ${.condition_images}
video_path: null
width: 848
@ -141,8 +137,7 @@
img_length_ratio: 0.587
ipadapter_image: ${.condition_images}
name: The-Laughing-Cavalier
prompt: (masterpiece, best quality, highres:1),(1girl, solo:1),(beautiful face,
soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
prompt: (masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3)
refer_image: ${.condition_images}
video_path: null
width: 1200

@ -1 +1 @@
Subproject commit 54a432af35f4f264f4a8361c7387fa8092c2dd7f
Subproject commit 54c6c49baf68bff290679f5bb896715f25932133

BIN data/demo/cyber_girl.png (new binary file, 110 KiB; not shown)

BIN data/demo/video1.mp4 (new binary file; not shown)

View File

@ -452,6 +452,43 @@ class DiffusersPipelinePredictor(object):
2. when the input parameter is None, use text2video to generate the vis cond image, and use it as refer_image and ip_adapter_image too.
3. when given from the input parameter but still redrawn, update with the redrawn vis cond image.
"""
# crop resize images
if condition_images is not None:
logger.debug(
f"center crop resize condition_images={condition_images.shape}, to height={height}, width={width}"
)
condition_images = batch_dynamic_crop_resize_images_v2(
condition_images,
target_height=height,
target_width=width,
)
if refer_image is not None:
logger.debug(
f"center crop resize refer_image to height={height}, width={width}"
)
refer_image = batch_dynamic_crop_resize_images_v2(
refer_image,
target_height=height,
target_width=width,
)
if ip_adapter_image is not None:
logger.debug(
f"center crop resize ip_adapter_image to height={height}, width={width}"
)
ip_adapter_image = batch_dynamic_crop_resize_images_v2(
ip_adapter_image,
target_height=height,
target_width=width,
)
if refer_face_image is not None:
logger.debug(
f"center crop resize refer_face_image to height={height}, width={width}"
)
refer_face_image = batch_dynamic_crop_resize_images_v2(
refer_face_image,
target_height=height,
target_width=width,
)
run_video_length = video_length
# generate vision condition frame start
# if condition_images is None, generate with refer_image, ip_adapter_image
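For orientation, here is a minimal single-image sketch of the center crop-and-resize step that `batch_dynamic_crop_resize_images_v2` performs on these inputs; the actual MuseV helper works on batches and its exact behavior may differ, so treat this as an illustrative assumption.

```python
# Illustrative sketch only: crop the largest centered region with the target
# aspect ratio, then resize to (target_width, target_height). The real
# batch_dynamic_crop_resize_images_v2 in MuseV operates on batches and may differ.
import numpy as np
from PIL import Image


def center_crop_resize(frame: np.ndarray, target_height: int, target_width: int) -> np.ndarray:
    h, w = frame.shape[:2]
    target_ratio = target_width / target_height
    if w / h > target_ratio:
        # too wide: crop the width around the center
        new_w = int(h * target_ratio)
        x0 = (w - new_w) // 2
        frame = frame[:, x0 : x0 + new_w]
    else:
        # too tall: crop the height around the center
        new_h = int(w / target_ratio)
        y0 = (h - new_h) // 2
        frame = frame[y0 : y0 + new_h, :]
    return np.array(Image.fromarray(frame).resize((target_width, target_height)))
```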

View File

@ -1,6 +1,6 @@
tensorflow==2.12.0
tensorboard==2.12.0
# tensorflow==2.12.0
# tensorboard==2.12.0
# torch==2.0.1+cu118
# torchvision==0.15.2+cu118
@ -50,18 +50,17 @@ requests
scipy
six
tqdm
gradio==3.43.2
gradio==4.12
albumentations==1.3.1
opencv-contrib-python==4.8.0.76
imageio-ffmpeg==0.4.8
pytorch-lightning==2.0.8
test-tube==0.7.5
timm
timm==0.9.12
addict
yapf
prettytable
safetensors==0.3.3
basicsr
fvcore
pycocotools
wandb==0.15.10
@ -88,5 +87,16 @@ IProgress==0.4
markupsafe==2.0.1
xlsxwriter
cuid
spaces
# https://mirrors.cloud.tencent.com/pypi/packages/de/a6/a49d5af79a515f5c9552a26b2078d839c40fcf8dccc0d94a1269276ab181/tb_nightly-2.1.0a20191022-py3-none-any.whl
basicsr
git+https://github.com/tencent-ailab/IP-Adapter.git@main
git+https://github.com/openai/CLIP.git@main
git+https://github.com/TMElyralab/controlnet_aux.git@tme
git+https://github.com/TMElyralab/diffusers.git@tme
git+https://github.com/TMElyralab/MMCM.git@main
numpy==1.23.5
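To apply only the dependency changes above to an existing environment, something like the following should suffice; the names and pins are taken directly from this requirements file, and the git-based dependencies still require git to be available.

```bash
# Update the pins changed in this revision (gradio, spaces, timm, numpy).
pip install --no-cache-dir "gradio==4.12" spaces "timm==0.9.12" "numpy==1.23.5"
```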

View File

@ -4,12 +4,14 @@ import pdb
import cuid
import gradio as gr
import spaces
import numpy as np
from huggingface_hub import snapshot_download
ProjectDir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
CheckpointsDir = os.path.join(ProjectDir, "checkpoints")
ignore_video2video = False
def download_model():
@ -28,17 +30,91 @@ def download_model():
download_model() # for huggingface deployment.
from gradio_video2video import online_v2v_inference
if not ignore_video2video:
from gradio_video2video import online_v2v_inference
from gradio_text2video import online_t2v_inference
def update_shape(image):
if image != None:
h, w, _ = image.shape
@spaces.GPU(duration=180)
def hf_online_t2v_inference(
prompt,
image_np,
seed,
fps,
w,
h,
video_len,
img_edge_ratio,
):
if not isinstance(image_np, np.ndarray): # None
raise gr.Error("Need input reference image")
return online_t2v_inference(
prompt, image_np, seed, fps, w, h, video_len, img_edge_ratio
)
@spaces.GPU(duration=180)
def hg_online_v2v_inference(
prompt,
image_np,
video,
processor,
seed,
fps,
w,
h,
video_length,
img_edge_ratio,
):
if not isinstance(image_np, np.ndarray): # None
raise gr.Error("Need input reference image")
return online_v2v_inference(
prompt,
image_np,
video,
processor,
seed,
fps,
w,
h,
video_length,
img_edge_ratio,
)
def limit_shape(image, input_w, input_h, img_edge_ratio, max_image_edge=960):
"""limite generation video shape to avoid gpu memory overflow"""
if isinstance(image, np.ndarray) and (input_h == -1 and input_w == -1):
input_h, input_w, _ = image.shape
h, w, _ = image.shape
if img_edge_ratio == 0:
img_edge_ratio = 1
img_edge_ratio_infact = min(max_image_edge / max(input_h, input_w), img_edge_ratio)
# print(
# image.shape,
# input_w,
# input_h,
# img_edge_ratio,
# max_image_edge,
# img_edge_ratio_infact,
# )
if img_edge_ratio != 1:
return (
img_edge_ratio_infact,
input_w * img_edge_ratio_infact,
input_h * img_edge_ratio_infact,
)
else:
return img_edge_ratio_infact, -1, -1
def limit_length(length):
"""limite generation video frames numer to avoid gpu memory overflow"""
if length > 24 * 6:
gr.Warning("Length need to smaller than 144, dute to gpu memory limit")
length = 24 * 6
return length
class ConcatenateBlock(gr.blocks.Block):
@ -121,97 +197,179 @@ with gr.Blocks(css=css) as demo:
with gr.Column():
prompt = gr.Textbox(label="Prompt")
image = gr.Image(label="VisionCondImage")
gr.Markdown("seed=-1 means that the seeds run each time are different")
seed = gr.Number(label="Seed", value=-1)
video_length = gr.Number(label="Video Length", value=12)
seed = gr.Number(
label="Seed (seed=-1 means that the seeds run each time are different)",
value=-1,
)
video_length = gr.Number(
label="Video Length(need smaller than 144,If you want to be able to generate longer videos, run it locally )",
value=12,
)
fps = gr.Number(label="Generate Video FPS", value=6)
gr.Markdown(
(
"If W&H is -1, then use the Reference Image's Size. Size of target video is $(W, H)*img\_edge\_ratio$. \n"
"The shorter the image size, the larger the motion amplitude, and the lower video quality.\n"
"The longer the W&H, the smaller the motion amplitude, and the higher video quality"
"The longer the W&H, the smaller the motion amplitude, and the higher video quality.\n"
"Due to the GPU VRAM limits, the W&H need smaller than 960px"
)
)
with gr.Row():
w = gr.Number(label="Width", value=-1)
h = gr.Number(label="Height", value=-1)
img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
with gr.Row():
out_w = gr.Number(label="Output Width", value=0, interactive=False)
out_h = gr.Number(label="Output Height", value=0, interactive=False)
img_edge_ratio_infact = gr.Number(
label="img_edge_ratio in fact",
value=1.0,
interactive=False,
)
btn1 = gr.Button("Generate")
out = gr.outputs.Video()
out = gr.Video()
# pdb.set_trace()
i2v_examples_256 = [
[
"(masterpiece, best quality, highres:1),(1boy, solo:1),(eye blinks:1.8),(head wave:1.3)",
"../../data/images/yongen.jpeg",
],
[
"(masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3)",
"../../data/images/The-Laughing-Cavalier.jpg",
],
]
with gr.Row():
board = gr.Dataframe(
value=[["", "", ""]] * 3,
interactive=False,
type="array",
label="Demo Video",
gr.Examples(
examples=i2v_examples_256,
inputs=[prompt, image],
outputs=[out],
fn=hf_online_t2v_inference,
cache_examples=False,
)
# image.change(fn=update_shape, inputs=[image], outputs=[w, h])
btn1.click(
fn=online_t2v_inference,
inputs=[prompt, image, seed, fps, w, h, video_length, img_edge_ratio],
outputs=out,
img_edge_ratio.change(
fn=limit_shape,
inputs=[image, w, h, img_edge_ratio],
outputs=[img_edge_ratio_infact, out_w, out_h],
)
with gr.Tab("Video to Video"):
with gr.Row():
with gr.Column():
prompt = gr.Textbox(label="Prompt")
gr.Markdown(
(
"pose of VisionCondImage should be same as of the first frame of the video. "
"its better generate target first frame whose pose is same as of first frame of the video with text2image tool, sch as MJ, SDXL."
)
)
image = gr.Image(label="VisionCondImage")
video = gr.Video(label="ReferVideo")
# radio = gr.inputs.Radio(, label="Select an option")
# ctr_button = gr.inputs.Button(label="Add ControlNet List")
# output_text = gr.outputs.Textbox()
processor = gr.Textbox(
label=f"Control Condition. gradio code now only support dwpose_body_hand, use command can support multi of {control_options}",
value="dwpose_body_hand",
)
gr.Markdown("seed=-1 means that seeds are different in every run")
seed = gr.Number(label="Seed", value=-1)
video_length = gr.Number(label="Video Length", value=12)
fps = gr.Number(label="Generate Video FPS", value=6)
gr.Markdown(
(
"If W&H is -1, then use the Reference Image's Size. Size of target video is $(W, H)*img\_edge\_ratio$. \n"
"The shorter the image size, the larger the motion amplitude, and the lower video quality. \n"
"The longer the W&H, the smaller the motion amplitude, and the higher video quality. "
)
)
with gr.Row():
w = gr.Number(label="Width", value=-1)
h = gr.Number(label="Height", value=-1)
img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
video_length.change(
fn=limit_length, inputs=[video_length], outputs=[video_length]
)
btn2 = gr.Button("Generate")
out1 = gr.outputs.Video()
# image.change(fn=update_shape, inputs=[image], outputs=[w, h])
btn2.click(
fn=online_v2v_inference,
btn1.click(
fn=hf_online_t2v_inference,
inputs=[
prompt,
image,
video,
processor,
seed,
fps,
w,
h,
video_length,
img_edge_ratio,
img_edge_ratio_infact,
],
outputs=out1,
outputs=out,
)
with gr.Tab("Video to Video"):
if ignore_video2video:
gr.Markdown(
(
"Due to GPU limit, MuseVDemo now only support Text2Video. If you want to try Video2Video, please run it locally. \n"
"We are trying to support video2video in the future. Thanks for your understanding."
)
)
else:
with gr.Row():
with gr.Column():
prompt = gr.Textbox(label="Prompt")
gr.Markdown(
(
"pose of VisionCondImage should be same as of the first frame of the video. "
"its better generate target first frame whose pose is same as of first frame of the video with text2image tool, sch as MJ, SDXL."
)
)
image = gr.Image(label="VisionCondImage")
video = gr.Video(label="ReferVideo")
# radio = gr.inputs.Radio(, label="Select an option")
# ctr_button = gr.inputs.Button(label="Add ControlNet List")
# output_text = gr.outputs.Textbox()
processor = gr.Textbox(
label=f"Control Condition. gradio code now only support dwpose_body_hand, use command can support multi of {control_options}",
value="dwpose_body_hand",
)
gr.Markdown("seed=-1 means that seeds are different in every run")
seed = gr.Number(
label="Seed (seed=-1 means that the seeds run each time are different)",
value=-1,
)
video_length = gr.Number(label="Video Length", value=12)
fps = gr.Number(label="Generate Video FPS", value=6)
gr.Markdown(
(
"If W&H is -1, then use the Reference Image's Size. Size of target video is $(W, H)*img\_edge\_ratio$. \n"
"The shorter the image size, the larger the motion amplitude, and the lower video quality.\n"
"The longer the W&H, the smaller the motion amplitude, and the higher video quality.\n"
"Due to the GPU VRAM limits, the W&H need smaller than 2000px"
)
)
with gr.Row():
w = gr.Number(label="Width", value=-1)
h = gr.Number(label="Height", value=-1)
img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
with gr.Row():
out_w = gr.Number(label="Width", value=0, interactive=False)
out_h = gr.Number(label="Height", value=0, interactive=False)
img_edge_ratio_infact = gr.Number(
label="img_edge_ratio in fact",
value=1.0,
interactive=False,
)
btn2 = gr.Button("Generate")
out1 = gr.Video()
v2v_examples_256 = [
[
"(masterpiece, best quality, highres:1), harley quinn is dancing, animation, by joshua klein",
"../../data/demo/cyber_girl.png",
"../../data/demo/video1.mp4",
],
]
with gr.Row():
gr.Examples(
examples=v2v_examples_256,
inputs=[prompt, image, video],
outputs=[out],
fn=hg_online_v2v_inference,
cache_examples=False,
)
img_edge_ratio.change(
fn=limit_shape,
inputs=[image, w, h, img_edge_ratio],
outputs=[img_edge_ratio_infact, out_w, out_h],
)
video_length.change(
fn=limit_length, inputs=[video_length], outputs=[video_length]
)
btn2.click(
fn=hg_online_v2v_inference,
inputs=[
prompt,
image,
video,
processor,
seed,
fps,
w,
h,
video_length,
img_edge_ratio_infact,
],
outputs=out1,
)
# Set the IP and port
ip_address = "0.0.0.0" # Replace with your desired IP address
@ -219,5 +377,5 @@ port_number = 7860 # Replace with your desired port number
demo.queue().launch(
share=False, debug=True, server_name=ip_address, server_port=port_number
share=True, debug=True, server_name=ip_address, server_port=port_number
)
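As a quick illustration of the new limits wired into the UI above, the sketch below mirrors the cap logic of `limit_shape` and `limit_length` with example numbers; it is a standalone restatement for explanation, not code from the demo.

```python
# Standalone restatement of the demo's caps: the longest image edge is held
# under max_image_edge (960 px) and the frame count under 24 * 6 = 144.
def capped_edge_ratio(input_h: int, input_w: int, img_edge_ratio: float, max_image_edge: int = 960) -> float:
    if img_edge_ratio == 0:
        img_edge_ratio = 1
    return min(max_image_edge / max(input_h, input_w), img_edge_ratio)


def capped_length(length: int) -> int:
    return min(length, 24 * 6)


# Example: a 1280x720 reference image with img_edge_ratio=1.0 is scaled by
# min(960 / 1280, 1.0) = 0.75, i.e. the video is rendered at 960x540.
print(capped_edge_ratio(720, 1280, 1.0))  # 0.75
print(capped_length(200))                 # 144
```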

View File

@ -0,0 +1,381 @@
import os
import time
import pdb
import cuid
import gradio as gr
import spaces
import numpy as np
from huggingface_hub import snapshot_download
ProjectDir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
CheckpointsDir = os.path.join(ProjectDir, "checkpoints")
ignore_video2video = True
def download_model():
if not os.path.exists(CheckpointsDir):
print("Checkpoint Not Downloaded, start downloading...")
tic = time.time()
snapshot_download(
repo_id="TMElyralab/MuseV",
local_dir=CheckpointsDir,
max_workers=8,
)
toc = time.time()
print(f"download cost {toc-tic} seconds")
else:
print("Already download the model.")
download_model() # for huggingface deployment.
if not ignore_video2video:
from gradio_video2video import online_v2v_inference
from gradio_text2video import online_t2v_inference
@spaces.GPU(duration=180)
def hf_online_t2v_inference(
prompt,
image_np,
seed,
fps,
w,
h,
video_len,
img_edge_ratio,
):
if not isinstance(image_np, np.ndarray): # None
raise gr.Error("Need input reference image")
return online_t2v_inference(
prompt, image_np, seed, fps, w, h, video_len, img_edge_ratio
)
@spaces.GPU(duration=180)
def hg_online_v2v_inference(
prompt,
image_np,
video,
processor,
seed,
fps,
w,
h,
video_length,
img_edge_ratio,
):
if not isinstance(image_np, np.ndarray): # None
raise gr.Error("Need input reference image")
return online_v2v_inference(
prompt,
image_np,
video,
processor,
seed,
fps,
w,
h,
video_length,
img_edge_ratio,
)
def limit_shape(image, input_w, input_h, img_edge_ratio, max_image_edge=960):
"""limite generation video shape to avoid gpu memory overflow"""
if isinstance(image, np.ndarray) and (input_h == -1 and input_w == -1):
input_h, input_w, _ = image.shape
h, w, _ = image.shape
if img_edge_ratio == 0:
img_edge_ratio = 1
img_edge_ratio_infact = min(max_image_edge / max(input_h, input_w), img_edge_ratio)
# print(
# image.shape,
# input_w,
# input_h,
# img_edge_ratio,
# max_image_edge,
# img_edge_ratio_infact,
# )
if img_edge_ratio != 1:
return (
img_edge_ratio_infact,
input_w * img_edge_ratio_infact,
input_h * img_edge_ratio_infact,
)
else:
return img_edge_ratio_infact, -1, -1
def limit_length(length):
"""limite generation video frames numer to avoid gpu memory overflow"""
if length > 24 * 6:
gr.Warning("Length need to smaller than 144, dute to gpu memory limit")
length = 24 * 6
return length
class ConcatenateBlock(gr.blocks.Block):
def __init__(self, options):
self.options = options
self.current_string = ""
def update_string(self, new_choice):
if new_choice and new_choice not in self.current_string.split(", "):
if self.current_string == "":
self.current_string = new_choice
else:
self.current_string += ", " + new_choice
return self.current_string
def process_input(new_choice):
return concatenate_block.update_string(new_choice), ""
control_options = [
"pose",
"pose_body",
"pose_hand",
"pose_face",
"pose_hand_body",
"pose_hand_face",
"dwpose",
"dwpose_face",
"dwpose_hand",
"dwpose_body",
"dwpose_body_hand",
"canny",
"tile",
"hed",
"hed_scribble",
"depth",
"pidi",
"normal_bae",
"lineart",
"lineart_anime",
"zoe",
"sam",
"mobile_sam",
"leres",
"content",
"face_detector",
]
concatenate_block = ConcatenateBlock(control_options)
css = """#input_img {max-width: 1024px !important} #output_vid {max-width: 1024px; max-height: 576px}"""
with gr.Blocks(css=css) as demo:
gr.Markdown(
"<div align='center'> <h1> MuseV: Infinite-length and High Fidelity Virtual Human Video Generation with Visual Conditioned Parallel Denoising</span> </h1> \
<h2 style='font-weight: 450; font-size: 1rem; margin: 0rem'>\
</br>\
Zhiqiang Xia <sup>*</sup>,\
Zhaokang Chen<sup>*</sup>,\
Bin Wu<sup></sup>,\
Chao Li,\
Kwok-Wai Hung,\
Chao Zhan,\
Yingjie He,\
Wenjiang Zhou\
(<sup>*</sup>Equal Contribution, <sup></sup>Corresponding Author, benbinwu@tencent.com)\
</br>\
Lyra Lab, Tencent Music Entertainment\
</h2> \
<a style='font-size:18px;color: #000000' href='https://github.com/TMElyralab/MuseV'>[Github Repo]</a>\
<a style='font-size:18px;color: #000000'>, which is important to Open-Source projects. Thanks!</a>\
<a style='font-size:18px;color: #000000' href=''> [ArXiv(Coming Soon)] </a>\
<a style='font-size:18px;color: #000000' href=''> [Project Page(Coming Soon)] </a> \
<a style='font-size:18px;color: #000000'>If MuseV is useful, please help star the repo~ </a> </div>"
)
with gr.Tab("Text to Video"):
with gr.Row():
with gr.Column():
prompt = gr.Textbox(label="Prompt")
image = gr.Image(label="VisionCondImage")
seed = gr.Number(
label="Seed (seed=-1 means that the seeds run each time are different)",
value=-1,
)
video_length = gr.Number(
label="Video Length(need smaller than 144,If you want to be able to generate longer videos, run it locally )",
value=12,
)
fps = gr.Number(label="Generate Video FPS", value=6)
gr.Markdown(
(
"If W&H is -1, then use the Reference Image's Size. Size of target video is $(W, H)*img\_edge\_ratio$. \n"
"The shorter the image size, the larger the motion amplitude, and the lower video quality.\n"
"The longer the W&H, the smaller the motion amplitude, and the higher video quality.\n"
"Due to the GPU VRAM limits, the W&H need smaller than 960px"
)
)
with gr.Row():
w = gr.Number(label="Width", value=-1)
h = gr.Number(label="Height", value=-1)
img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
with gr.Row():
out_w = gr.Number(label="Output Width", value=0, interactive=False)
out_h = gr.Number(label="Output Height", value=0, interactive=False)
img_edge_ratio_infact = gr.Number(
label="img_edge_ratio in fact",
value=1.0,
interactive=False,
)
btn1 = gr.Button("Generate")
out = gr.Video()
# pdb.set_trace()
i2v_examples_256 = [
[
"(masterpiece, best quality, highres:1),(1boy, solo:1),(eye blinks:1.8),(head wave:1.3)",
"../../data/images/yongen.jpeg",
],
[
"(masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3)",
"../../data/images/The-Laughing-Cavalier.jpg",
],
]
with gr.Row():
gr.Examples(
examples=i2v_examples_256,
inputs=[prompt, image],
outputs=[out],
fn=hf_online_t2v_inference,
cache_examples=False,
)
img_edge_ratio.change(
fn=limit_shape,
inputs=[image, w, h, img_edge_ratio],
outputs=[img_edge_ratio_infact, out_w, out_h],
)
video_length.change(
fn=limit_length, inputs=[video_length], outputs=[video_length]
)
btn1.click(
fn=hf_online_t2v_inference,
inputs=[
prompt,
image,
seed,
fps,
w,
h,
video_length,
img_edge_ratio_infact,
],
outputs=out,
)
with gr.Tab("Video to Video"):
if ignore_video2video:
gr.Markdown(
(
"Due to GPU limit, MuseVDemo now only support Text2Video. If you want to try Video2Video, please run it locally. \n"
"We are trying to support video2video in the future. Thanks for your understanding."
)
)
else:
with gr.Row():
with gr.Column():
prompt = gr.Textbox(label="Prompt")
gr.Markdown(
(
"pose of VisionCondImage should be same as of the first frame of the video. "
"its better generate target first frame whose pose is same as of first frame of the video with text2image tool, sch as MJ, SDXL."
)
)
image = gr.Image(label="VisionCondImage")
video = gr.Video(label="ReferVideo")
# radio = gr.inputs.Radio(, label="Select an option")
# ctr_button = gr.inputs.Button(label="Add ControlNet List")
# output_text = gr.outputs.Textbox()
processor = gr.Textbox(
label=f"Control Condition. gradio code now only support dwpose_body_hand, use command can support multi of {control_options}",
value="dwpose_body_hand",
)
gr.Markdown("seed=-1 means that seeds are different in every run")
seed = gr.Number(
label="Seed (seed=-1 means that the seeds run each time are different)",
value=-1,
)
video_length = gr.Number(label="Video Length", value=12)
fps = gr.Number(label="Generate Video FPS", value=6)
gr.Markdown(
(
"If W&H is -1, then use the Reference Image's Size. Size of target video is $(W, H)*img\_edge\_ratio$. \n"
"The shorter the image size, the larger the motion amplitude, and the lower video quality.\n"
"The longer the W&H, the smaller the motion amplitude, and the higher video quality.\n"
"Due to the GPU VRAM limits, the W&H need smaller than 2000px"
)
)
with gr.Row():
w = gr.Number(label="Width", value=-1)
h = gr.Number(label="Height", value=-1)
img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
with gr.Row():
out_w = gr.Number(label="Width", value=0, interactive=False)
out_h = gr.Number(label="Height", value=0, interactive=False)
img_edge_ratio_infact = gr.Number(
label="img_edge_ratio in fact",
value=1.0,
interactive=False,
)
btn2 = gr.Button("Generate")
out1 = gr.Video()
v2v_examples_256 = [
[
"(masterpiece, best quality, highres:1), harley quinn is dancing, animation, by joshua klein",
"../../data/demo/cyber_girl.png",
"../../data/demo/video1.mp4",
],
]
with gr.Row():
gr.Examples(
examples=v2v_examples_256,
inputs=[prompt, image, video],
outputs=[out],
fn=hg_online_v2v_inference,
cache_examples=False,
)
img_edge_ratio.change(
fn=limit_shape,
inputs=[image, w, h, img_edge_ratio],
outputs=[img_edge_ratio_infact, out_w, out_h],
)
video_length.change(
fn=limit_length, inputs=[video_length], outputs=[video_length]
)
btn2.click(
fn=hg_online_v2v_inference,
inputs=[
prompt,
image,
video,
processor,
seed,
fps,
w,
h,
video_length,
img_edge_ratio_infact,
],
outputs=out1,
)
# Set the IP and port
ip_address = "0.0.0.0" # Replace with your desired IP address
port_number = 7860 # Replace with your desired port number
demo.queue().launch(
share=True, debug=True, server_name=ip_address, server_port=port_number
)

View File

@ -0,0 +1,416 @@
import os
import time
import pdb
import cuid
import gradio as gr
import spaces
import numpy as np
import sys
from huggingface_hub import snapshot_download
import subprocess
ProjectDir = os.path.abspath(os.path.dirname(__file__))
CheckpointsDir = os.path.join(ProjectDir, "checkpoints")
sys.path.insert(0, ProjectDir)
sys.path.insert(0, f"{ProjectDir}/MMCM")
sys.path.insert(0, f"{ProjectDir}/diffusers/src")
sys.path.insert(0, f"{ProjectDir}/controlnet_aux/src")
sys.path.insert(0, f"{ProjectDir}/scripts/gradio")
result = subprocess.run(
["pip", "install", "--no-cache-dir", "-U", "openmim"],
capture_output=True,
text=True,
)
print(result)
result = subprocess.run(["mim", "install", "mmengine"], capture_output=True, text=True)
print(result)
result = subprocess.run(
["mim", "install", "mmcv>=2.0.1"], capture_output=True, text=True
)
print(result)
result = subprocess.run(
["mim", "install", "mmdet>=3.1.0"], capture_output=True, text=True
)
print(result)
result = subprocess.run(
["mim", "install", "mmpose>=1.1.0"], capture_output=True, text=True
)
print(result)
ignore_video2video = True
def download_model():
if not os.path.exists(CheckpointsDir):
print("Checkpoint Not Downloaded, start downloading...")
tic = time.time()
snapshot_download(
repo_id="TMElyralab/MuseV",
local_dir=CheckpointsDir,
max_workers=8,
local_dir_use_symlinks=True,
)
toc = time.time()
print(f"download cost {toc-tic} seconds")
else:
print("Already download the model.")
download_model() # for huggingface deployment.
if not ignore_video2video:
from gradio_video2video import online_v2v_inference
from gradio_text2video import online_t2v_inference
@spaces.GPU(duration=180)
def hf_online_t2v_inference(
prompt,
image_np,
seed,
fps,
w,
h,
video_len,
img_edge_ratio,
):
if not isinstance(image_np, np.ndarray): # None
raise gr.Error("Need input reference image")
return online_t2v_inference(
prompt, image_np, seed, fps, w, h, video_len, img_edge_ratio
)
@spaces.GPU(duration=180)
def hg_online_v2v_inference(
prompt,
image_np,
video,
processor,
seed,
fps,
w,
h,
video_length,
img_edge_ratio,
):
if not isinstance(image_np, np.ndarray): # None
raise gr.Error("Need input reference image")
return online_v2v_inference(
prompt,
image_np,
video,
processor,
seed,
fps,
w,
h,
video_length,
img_edge_ratio,
)
def limit_shape(image, input_w, input_h, img_edge_ratio, max_image_edge=960):
"""limite generation video shape to avoid gpu memory overflow"""
if isinstance(image, np.ndarray) and (input_h == -1 and input_w == -1):
input_h, input_w, _ = image.shape
h, w, _ = image.shape
if img_edge_ratio == 0:
img_edge_ratio = 1
img_edge_ratio_infact = min(max_image_edge / max(input_h, input_w), img_edge_ratio)
# print(
# image.shape,
# input_w,
# input_h,
# img_edge_ratio,
# max_image_edge,
# img_edge_ratio_infact,
# )
if img_edge_ratio != 1:
return (
img_edge_ratio_infact,
input_w * img_edge_ratio_infact,
input_h * img_edge_ratio_infact,
)
else:
return img_edge_ratio_infact, -1, -1
def limit_length(length):
"""limite generation video frames numer to avoid gpu memory overflow"""
if length > 24 * 6:
gr.Warning("Length need to smaller than 144, dute to gpu memory limit")
length = 24 * 6
return length
class ConcatenateBlock(gr.blocks.Block):
def __init__(self, options):
self.options = options
self.current_string = ""
def update_string(self, new_choice):
if new_choice and new_choice not in self.current_string.split(", "):
if self.current_string == "":
self.current_string = new_choice
else:
self.current_string += ", " + new_choice
return self.current_string
def process_input(new_choice):
return concatenate_block.update_string(new_choice), ""
control_options = [
"pose",
"pose_body",
"pose_hand",
"pose_face",
"pose_hand_body",
"pose_hand_face",
"dwpose",
"dwpose_face",
"dwpose_hand",
"dwpose_body",
"dwpose_body_hand",
"canny",
"tile",
"hed",
"hed_scribble",
"depth",
"pidi",
"normal_bae",
"lineart",
"lineart_anime",
"zoe",
"sam",
"mobile_sam",
"leres",
"content",
"face_detector",
]
concatenate_block = ConcatenateBlock(control_options)
css = """#input_img {max-width: 1024px !important} #output_vid {max-width: 1024px; max-height: 576px}"""
with gr.Blocks(css=css) as demo:
gr.Markdown(
"<div align='center'> <h1> MuseV: Infinite-length and High Fidelity Virtual Human Video Generation with Visual Conditioned Parallel Denoising</span> </h1> \
<h2 style='font-weight: 450; font-size: 1rem; margin: 0rem'>\
</br>\
Zhiqiang Xia <sup>*</sup>,\
Zhaokang Chen<sup>*</sup>,\
Bin Wu<sup></sup>,\
Chao Li,\
Kwok-Wai Hung,\
Chao Zhan,\
Yingjie He,\
Wenjiang Zhou\
(<sup>*</sup>Equal Contribution, <sup></sup>Corresponding Author, benbinwu@tencent.com)\
</br>\
Lyra Lab, Tencent Music Entertainment\
</h2> \
<a style='font-size:18px;color: #000000' href='https://github.com/TMElyralab/MuseV'>[Github Repo]</a>\
<a style='font-size:18px;color: #000000'>, which is important to Open-Source projects. Thanks!</a>\
<a style='font-size:18px;color: #000000' href=''> [ArXiv(Coming Soon)] </a>\
<a style='font-size:18px;color: #000000' href=''> [Project Page(Coming Soon)] </a> \
<a style='font-size:18px;color: #000000'>If MuseV is useful, please help star the repo~ </a> </div>"
)
with gr.Tab("Text to Video"):
with gr.Row():
with gr.Column():
prompt = gr.Textbox(label="Prompt")
image = gr.Image(label="VisionCondImage")
seed = gr.Number(
label="Seed (seed=-1 means that the seeds run each time are different)",
value=-1,
)
video_length = gr.Number(
label="Video Length(need smaller than 144,If you want to be able to generate longer videos, run it locally )",
value=12,
)
fps = gr.Number(label="Generate Video FPS", value=6)
gr.Markdown(
(
"If W&H is -1, then use the Reference Image's Size. Size of target video is $(W, H)*img\_edge\_ratio$. \n"
"The shorter the image size, the larger the motion amplitude, and the lower video quality.\n"
"The longer the W&H, the smaller the motion amplitude, and the higher video quality.\n"
"Due to the GPU VRAM limits, the W&H need smaller than 960px"
)
)
with gr.Row():
w = gr.Number(label="Width", value=-1)
h = gr.Number(label="Height", value=-1)
img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
with gr.Row():
out_w = gr.Number(label="Output Width", value=0, interactive=False)
out_h = gr.Number(label="Output Height", value=0, interactive=False)
img_edge_ratio_infact = gr.Number(
label="img_edge_ratio in fact",
value=1.0,
interactive=False,
)
btn1 = gr.Button("Generate")
out = gr.Video()
# pdb.set_trace()
i2v_examples_256 = [
[
"(masterpiece, best quality, highres:1),(1boy, solo:1),(eye blinks:1.8),(head wave:1.3)",
"../../data/images/yongen.jpeg",
],
[
"(masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3)",
"../../data/images/The-Laughing-Cavalier.jpg",
],
]
with gr.Row():
gr.Examples(
examples=i2v_examples_256,
inputs=[prompt, image],
outputs=[out],
fn=hf_online_t2v_inference,
cache_examples=False,
)
img_edge_ratio.change(
fn=limit_shape,
inputs=[image, w, h, img_edge_ratio],
outputs=[img_edge_ratio_infact, out_w, out_h],
)
video_length.change(
fn=limit_length, inputs=[video_length], outputs=[video_length]
)
btn1.click(
fn=hf_online_t2v_inference,
inputs=[
prompt,
image,
seed,
fps,
w,
h,
video_length,
img_edge_ratio_infact,
],
outputs=out,
)
with gr.Tab("Video to Video"):
if ignore_video2video:
gr.Markdown(
(
"Due to GPU limit, MuseVDemo now only support Text2Video. If you want to try Video2Video, please run it locally. \n"
"We are trying to support video2video in the future. Thanks for your understanding."
)
)
else:
with gr.Row():
with gr.Column():
prompt = gr.Textbox(label="Prompt")
gr.Markdown(
(
"pose of VisionCondImage should be same as of the first frame of the video. "
"its better generate target first frame whose pose is same as of first frame of the video with text2image tool, sch as MJ, SDXL."
)
)
image = gr.Image(label="VisionCondImage")
video = gr.Video(label="ReferVideo")
# radio = gr.inputs.Radio(, label="Select an option")
# ctr_button = gr.inputs.Button(label="Add ControlNet List")
# output_text = gr.outputs.Textbox()
processor = gr.Textbox(
label=f"Control Condition. gradio code now only support dwpose_body_hand, use command can support multi of {control_options}",
value="dwpose_body_hand",
)
gr.Markdown("seed=-1 means that seeds are different in every run")
seed = gr.Number(
label="Seed (seed=-1 means that the seeds run each time are different)",
value=-1,
)
video_length = gr.Number(label="Video Length", value=12)
fps = gr.Number(label="Generate Video FPS", value=6)
gr.Markdown(
(
"If W&H is -1, then use the Reference Image's Size. Size of target video is $(W, H)*img\_edge\_ratio$. \n"
"The shorter the image size, the larger the motion amplitude, and the lower video quality.\n"
"The longer the W&H, the smaller the motion amplitude, and the higher video quality.\n"
"Due to the GPU VRAM limits, the W&H need smaller than 2000px"
)
)
with gr.Row():
w = gr.Number(label="Width", value=-1)
h = gr.Number(label="Height", value=-1)
img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
with gr.Row():
out_w = gr.Number(label="Width", value=0, interactive=False)
out_h = gr.Number(label="Height", value=0, interactive=False)
img_edge_ratio_infact = gr.Number(
label="img_edge_ratio in fact",
value=1.0,
interactive=False,
)
btn2 = gr.Button("Generate")
out1 = gr.Video()
v2v_examples_256 = [
[
"(masterpiece, best quality, highres:1), harley quinn is dancing, animation, by joshua klein",
"../../data/demo/cyber_girl.png",
"../../data/demo/video1.mp4",
],
]
with gr.Row():
gr.Examples(
examples=v2v_examples_256,
inputs=[prompt, image, video],
outputs=[out],
fn=hg_online_v2v_inference,
cache_examples=False,
)
img_edge_ratio.change(
fn=limit_shape,
inputs=[image, w, h, img_edge_ratio],
outputs=[img_edge_ratio_infact, out_w, out_h],
)
video_length.change(
fn=limit_length, inputs=[video_length], outputs=[video_length]
)
btn2.click(
fn=hg_online_v2v_inference,
inputs=[
prompt,
image,
video,
processor,
seed,
fps,
w,
h,
video_length,
img_edge_ratio_infact,
],
outputs=out1,
)
# Set the IP and port
ip_address = "0.0.0.0" # Replace with your desired IP address
port_number = 7860 # Replace with your desired port number
demo.queue().launch(
share=True, debug=True, server_name=ip_address, server_port=port_number
)
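The subprocess calls at the top of this file install the mmlab stack at startup; they correspond to roughly the same commands the README gives for a manual setup (assuming pip and mim are on PATH):

```bash
pip install --no-cache-dir -U openmim
mim install mmengine
mim install "mmcv>=2.0.1"
mim install "mmdet>=3.1.0"
mim install "mmpose>=1.1.0"
```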

View File

@ -1,223 +0,0 @@
import os
import time
import pdb
import cuid
import gradio as gr
from huggingface_hub import snapshot_download
ProjectDir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
CheckpointsDir = os.path.join(ProjectDir, "checkpoints")
def download_model():
if not os.path.exists(CheckpointsDir):
print("Checkpoint Not Downloaded, start downloading...")
tic = time.time()
snapshot_download(
repo_id="TMElyralab/MuseV",
local_dir=CheckpointsDir,
max_workers=8,
)
toc = time.time()
print(f"download cost {toc-tic} seconds")
else:
print("Already download the model.")
download_model() # for huggingface deployment.
from gradio_video2video import online_v2v_inference
from gradio_text2video import online_t2v_inference
def update_shape(image):
if image != None:
h, w, _ = image.shape
else:
h, w = 768, 512
return w, h
class ConcatenateBlock(gr.blocks.Block):
def __init__(self, options):
self.options = options
self.current_string = ""
def update_string(self, new_choice):
if new_choice and new_choice not in self.current_string.split(", "):
if self.current_string == "":
self.current_string = new_choice
else:
self.current_string += ", " + new_choice
return self.current_string
def process_input(new_choice):
return concatenate_block.update_string(new_choice), ""
control_options = [
"pose",
"pose_body",
"pose_hand",
"pose_face",
"pose_hand_body",
"pose_hand_face",
"dwpose",
"dwpose_face",
"dwpose_hand",
"dwpose_body",
"dwpose_body_hand",
"canny",
"tile",
"hed",
"hed_scribble",
"depth",
"pidi",
"normal_bae",
"lineart",
"lineart_anime",
"zoe",
"sam",
"mobile_sam",
"leres",
"content",
"face_detector",
]
concatenate_block = ConcatenateBlock(control_options)
css = """#input_img {max-width: 1024px !important} #output_vid {max-width: 1024px; max-height: 576px}"""
with gr.Blocks(css=css) as demo:
gr.Markdown(
"<div align='center'> <h1> MuseV: Infinite-length and High Fidelity Virtual Human Video Generation with Visual Conditioned Parallel Denoising</span> </h1> \
<h2 style='font-weight: 450; font-size: 1rem; margin: 0rem'>\
</br>\
Zhiqiang Xia <sup>*</sup>,\
Zhaokang Chen<sup>*</sup>,\
Bin Wu<sup></sup>,\
Chao Li,\
Kwok-Wai Hung,\
Chao Zhan,\
Yingjie He,\
Wenjiang Zhou\
(<sup>*</sup>Equal Contribution, <sup></sup>Corresponding Author, benbinwu@tencent.com)\
</br>\
Lyra Lab, Tencent Music Entertainment\
</h2> \
<a style='font-size:18px;color: #000000' href='https://github.com/TMElyralab/MuseV'>[Github Repo]</a>\
<a style='font-size:18px;color: #000000'>, which is important to Open-Source projects. Thanks!</a>\
<a style='font-size:18px;color: #000000' href=''> [ArXiv(Coming Soon)] </a>\
<a style='font-size:18px;color: #000000' href=''> [Project Page(Coming Soon)] </a> \
<a style='font-size:18px;color: #000000'>If MuseV is useful, please help star the repo~ </a> </div>"
)
with gr.Tab("Text to Video"):
with gr.Row():
with gr.Column():
prompt = gr.Textbox(label="Prompt")
image = gr.Image(label="VisionCondImage")
gr.Markdown("seed=-1 means that the seeds run each time are different")
seed = gr.Number(label="Seed", value=-1)
video_length = gr.Number(label="Video Length", value=12)
fps = gr.Number(label="Generate Video FPS", value=6)
gr.Markdown(
(
"If W&H is -1, then use the Reference Image's Size. Size of target video is $(W, H)*img\_edge\_ratio$. \n"
"The shorter the image size, the larger the motion amplitude, and the lower video quality.\n"
"The longer the W&H, the smaller the motion amplitude, and the higher video quality"
)
)
with gr.Row():
w = gr.Number(label="Width", value=-1)
h = gr.Number(label="Height", value=-1)
img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
btn1 = gr.Button("Generate")
out = gr.outputs.Video()
# pdb.set_trace()
with gr.Row():
board = gr.Dataframe(
value=[["", "", ""]] * 3,
interactive=False,
type="array",
label="Demo Video",
)
# image.change(fn=update_shape, inputs=[image], outputs=[w, h])
btn1.click(
fn=online_t2v_inference,
inputs=[prompt, image, seed, fps, w, h, video_length, img_edge_ratio],
outputs=out,
)
with gr.Tab("Video to Video"):
with gr.Row():
with gr.Column():
prompt = gr.Textbox(label="Prompt")
gr.Markdown(
(
"pose of VisionCondImage should be same as of the first frame of the video. "
"its better generate target first frame whose pose is same as of first frame of the video with text2image tool, sch as MJ, SDXL."
)
)
image = gr.Image(label="VisionCondImage")
video = gr.Video(label="ReferVideo")
# radio = gr.inputs.Radio(, label="Select an option")
# ctr_button = gr.inputs.Button(label="Add ControlNet List")
# output_text = gr.outputs.Textbox()
processor = gr.Textbox(
label=f"Control Condition. gradio code now only support dwpose_body_hand, use command can support multi of {control_options}",
value="dwpose_body_hand",
)
gr.Markdown("seed=-1 means that seeds are different in every run")
seed = gr.Number(label="Seed", value=-1)
video_length = gr.Number(label="Video Length", value=12)
fps = gr.Number(label="Generate Video FPS", value=6)
gr.Markdown(
(
"If W&H is -1, then use the Reference Image's Size. Size of target video is $(W, H)*img\_edge\_ratio$. \n"
"The shorter the image size, the larger the motion amplitude, and the lower video quality. \n"
"The longer the W&H, the smaller the motion amplitude, and the higher video quality. "
)
)
with gr.Row():
w = gr.Number(label="Width", value=-1)
h = gr.Number(label="Height", value=-1)
img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
btn2 = gr.Button("Generate")
out1 = gr.outputs.Video()
# image.change(fn=update_shape, inputs=[image], outputs=[w, h])
btn2.click(
fn=online_v2v_inference,
inputs=[
prompt,
image,
video,
processor,
seed,
fps,
w,
h,
video_length,
img_edge_ratio,
],
outputs=out1,
)
# Set the IP and port
ip_address = "0.0.0.0" # Replace with your desired IP address
port_number = 7860 # Replace with your desired port number
demo.queue().launch(
share=False, debug=True, server_name=ip_address, server_port=port_number
)

View File

@ -45,10 +45,8 @@ from musev.models.unet_loader import load_unet_by_name
from musev.utils.util import save_videos_grid_with_opencv
from musev import logger
need_load_predictor = False
if need_load_predictor:
video_sd_predictor = None
else:
use_v2v_predictor = False
if use_v2v_predictor:
from gradio_video2video import sd_predictor as video_sd_predictor
logger.setLevel("INFO")
@ -464,7 +462,7 @@ def read_image_and_name(path):
return images, name
if referencenet_model_name is not None and need_load_predictor:
if referencenet_model_name is not None and not use_v2v_predictor:
referencenet = load_referencenet_by_name(
model_name=referencenet_model_name,
# sd_model=sd_model_path,
@ -476,7 +474,7 @@ else:
referencenet = None
referencenet_model_name = "no"
if vision_clip_extractor_class_name is not None and need_load_predictor:
if vision_clip_extractor_class_name is not None and not use_v2v_predictor:
vision_clip_extractor = load_vision_clip_encoder_by_name(
ip_image_encoder=vision_clip_model_path,
vision_clip_extractor_class_name=vision_clip_extractor_class_name,
@ -488,7 +486,7 @@ else:
vision_clip_extractor = None
logger.info(f"vision_clip_extractor, None")
if ip_adapter_model_name is not None and need_load_predictor:
if ip_adapter_model_name is not None and not use_v2v_predictor:
ip_adapter_image_proj = load_ip_adapter_image_proj_by_name(
model_name=ip_adapter_model_name,
ip_image_encoder=ip_adapter_model_params_dict.get(
@ -526,11 +524,11 @@ for model_name, sd_model_params in sd_model_params_dict.items():
strict=not (facein_model_name is not None),
need_t2i_ip_adapter_face=ip_adapter_face_model_name is not None,
)
if need_load_predictor
if not use_v2v_predictor
else None
)
if facein_model_name is not None and need_load_predictor:
if facein_model_name is not None and not use_v2v_predictor:
(
face_emb_extractor,
facein_image_proj,
@ -552,7 +550,7 @@ for model_name, sd_model_params in sd_model_params_dict.items():
face_emb_extractor = None
facein_image_proj = None
if ip_adapter_face_model_name is not None and need_load_predictor:
if ip_adapter_face_model_name is not None and not use_v2v_predictor:
(
ip_adapter_face_emb_extractor,
ip_adapter_face_image_proj,
@ -595,10 +593,10 @@ for model_name, sd_model_params in sd_model_params_dict.items():
ip_adapter_face_emb_extractor=ip_adapter_face_emb_extractor,
ip_adapter_face_image_proj=ip_adapter_face_image_proj,
)
if need_load_predictor
if not use_v2v_predictor
else video_sd_predictor
)
if not need_load_predictor:
if use_v2v_predictor:
print(
"text2video use video_sd_predictor, sd_predictor type is ",
type(sd_predictor),