update gradio (#55)

* fix limitation of GPU VRAM
* add time limitation
* reshape logic
* <feature> add example and input limitations
* update gradio scripts and requirements

Co-authored-by: trumpzhan <trumpzhan@tencent.com>
parent 07fa407dc5
commit 504d705db4
@@ -14,6 +14,5 @@ RUN . /opt/conda/etc/profile.d/conda.sh \
    && echo "source activate musev" >> ~/.bashrc \
    && conda activate musev \
    && conda env list \
-   && pip install cuid
+   && pip --no-cache-dir install cuid gradio==4.12 spaces
 USER root
README-zh.md (18 changes)
@ -65,8 +65,7 @@ Wenjiang Zhou
|
||||
<td >
|
||||
<video src="https://github.com/TMElyralab/MuseV/assets/163980830/732cf1fd-25e7-494e-b462-969c9425d277" width="100" controls preload></video>
|
||||
</td>
|
||||
<td>(masterpiece, best quality, highres:1),(1girl, solo:1),(beautiful face,
|
||||
soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
|
||||
<td>(masterpiece, best quality, highres:1),(1boy, solo:1),(eye blinks:1.8),(head wave:1.3)
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
@ -159,8 +158,7 @@ Wenjiang Zhou
|
||||
<video src="https://github.com/TMElyralab/MuseV/assets/163980830/28294baa-b996-420f-b1fb-046542adf87d" width="100" controls preload></video>
|
||||
</td>
|
||||
<td>
|
||||
(masterpiece, best quality, highres:1),(1man, solo:1),(beautiful face,
|
||||
soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
|
||||
(masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3),Chinese ink painting style
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
@ -184,8 +182,7 @@ Wenjiang Zhou
|
||||
<video src="https://github.com/TMElyralab/MuseV/assets/163980830/4072410a-ecea-4ee5-a9b4-735f9f462d51" width="100" controls preload></video>
|
||||
</td>
|
||||
<td>
|
||||
(masterpiece, best quality, highres:1),(1man, solo:1),(beautiful face,
|
||||
soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
|
||||
(masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3)
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
@ -196,8 +193,7 @@ Wenjiang Zhou
|
||||
<video src="https://github.com/TMElyralab/MuseV/assets/163980830/5148beda-a1e1-44f0-ad84-2fb99ad73a11" width="100" controls preload></video>
|
||||
</td>
|
||||
<td>
|
||||
(masterpiece, best quality, highres:1),(1girl, solo:1),(beautiful face,
|
||||
soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
|
||||
(masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3), animate
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
@ -283,7 +279,7 @@ Wenjiang Zhou
|
||||
<video src="https://github.com/TMElyralab/MuseV/assets/163980830/484cc69d-c316-4464-a55b-3df929780a8e" width="400" controls preload></video>
|
||||
</td>
|
||||
<td>
|
||||
(masterpiece, best quality, highres:1)
|
||||
(masterpiece, best quality, highres:1) , a girl is dancing, animation
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
@ -294,7 +290,7 @@ Wenjiang Zhou
|
||||
<video src="https://github.com/TMElyralab/MuseV/assets/163980830/c44682e6-aafc-4730-8fc1-72825c1bacf2" width="400" controls preload></video>
|
||||
</td>
|
||||
<td>
|
||||
(masterpiece, best quality, highres:1)
|
||||
(masterpiece, best quality, highres:1), is dancing, animation
|
||||
</td>
|
||||
</tr>
|
||||
</table >
|
||||
@ -374,7 +370,7 @@ pip install -r requirements.txt
|
||||
#### Prepare the [openmmlab](https://openmmlab.com/) packages
|
||||
If you are not using Docker, you also need to install the mmlab packages.
|
||||
```bash
|
||||
pip install--no-cache-dir -U openmim
|
||||
pip install --no-cache-dir -U openmim
|
||||
mim install mmengine
|
||||
mim install "mmcv>=2.0.1"
|
||||
mim install "mmdet>=3.1.0"
|
||||
README.md (18 changes)
@ -65,8 +65,7 @@ Examples bellow can be accessed at `configs/tasks/example.yaml`
|
||||
<td >
|
||||
<video src="https://github.com/TMElyralab/MuseV/assets/163980830/732cf1fd-25e7-494e-b462-969c9425d277" width="100" controls preload></video>
|
||||
</td>
|
||||
<td>(masterpiece, best quality, highres:1),(1girl, solo:1),(beautiful face,
|
||||
soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
|
||||
<td>(masterpiece, best quality, highres:1),(1boy, solo:1),(eye blinks:1.8),(head wave:1.3)
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
@ -159,8 +158,7 @@ Examples bellow can be accessed at `configs/tasks/example.yaml`
|
||||
<video src="https://github.com/TMElyralab/MuseV/assets/163980830/28294baa-b996-420f-b1fb-046542adf87d" width="100" controls preload></video>
|
||||
</td>
|
||||
<td>
|
||||
(masterpiece, best quality, highres:1),(1man, solo:1),(beautiful face,
|
||||
soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
|
||||
(masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3),Chinese ink painting style
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
@ -184,8 +182,7 @@ Examples bellow can be accessed at `configs/tasks/example.yaml`
|
||||
<video src="https://github.com/TMElyralab/MuseV/assets/163980830/4072410a-ecea-4ee5-a9b4-735f9f462d51" width="100" controls preload></video>
|
||||
</td>
|
||||
<td>
|
||||
(masterpiece, best quality, highres:1),(1man, solo:1),(beautiful face,
|
||||
soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
|
||||
(masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3)
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
@ -196,8 +193,7 @@ Examples bellow can be accessed at `configs/tasks/example.yaml`
|
||||
<video src="https://github.com/TMElyralab/MuseV/assets/163980830/5148beda-a1e1-44f0-ad84-2fb99ad73a11" width="100" controls preload></video>
|
||||
</td>
|
||||
<td>
|
||||
(masterpiece, best quality, highres:1),(1girl, solo:1),(beautiful face,
|
||||
soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
|
||||
(masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3), animate
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
@ -279,7 +275,7 @@ In `duffy` mode, pose of the vision condition frame is not aligned with the firs
|
||||
<video src="https://github.com/TMElyralab/MuseV/assets/163980830/484cc69d-c316-4464-a55b-3df929780a8e" width="400" controls preload></video>
|
||||
</td>
|
||||
<td>
|
||||
(masterpiece, best quality, highres:1)
|
||||
(masterpiece, best quality, highres:1) , a girl is dancing, animation
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
@ -290,7 +286,7 @@ In `duffy` mode, pose of the vision condition frame is not aligned with the firs
|
||||
<video src="https://github.com/TMElyralab/MuseV/assets/163980830/c44682e6-aafc-4730-8fc1-72825c1bacf2" width="400" controls preload></video>
|
||||
</td>
|
||||
<td>
|
||||
(masterpiece, best quality, highres:1)
|
||||
(masterpiece, best quality, highres:1), is dancing, animation
|
||||
</td>
|
||||
</tr>
|
||||
</table >
|
||||
@ -370,7 +366,7 @@ pip install -r requirements.txt
|
||||
#### Prepare mmlab package
|
||||
If not using Docker, the mmlab packages need to be installed additionally.
|
||||
```bash
|
||||
pip install--no-cache-dir -U openmim
|
||||
pip install --no-cache-dir -U openmim
|
||||
mim install mmengine
|
||||
mim install "mmcv>=2.0.1"
|
||||
mim install "mmdet>=3.1.0"
|
||||
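When a shell is not convenient, the same openmim installs can be driven from Python, as the new `scripts/gradio/app_gradio_space.py` added in this commit does at startup. A minimal sketch of that pattern (the exact package list mirrors that script and is otherwise an assumption):

```python
import subprocess

# Mirror of the shell commands above, as run by app_gradio_space.py at startup.
for cmd in (
    ["pip", "install", "--no-cache-dir", "-U", "openmim"],
    ["mim", "install", "mmengine"],
    ["mim", "install", "mmcv>=2.0.1"],
    ["mim", "install", "mmdet>=3.1.0"],
    ["mim", "install", "mmpose>=1.1.0"],
):
    result = subprocess.run(cmd, capture_output=True, text=True)
    print(cmd, "->", result.returncode)
```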
|
@ -15,8 +15,7 @@
|
||||
img_length_ratio: 0.957
|
||||
ipadapter_image: ${.condition_images}
|
||||
name: yongen
|
||||
prompt: (masterpiece, best quality, highres:1),(1girl, solo:1),(beautiful face,
|
||||
soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
|
||||
prompt: (masterpiece, best quality, highres:1),(1boy, solo:1),(eye blinks:1.8),(head wave:1.3)
|
||||
refer_image: ${.condition_images}
|
||||
video_path: null
|
||||
width: 736
|
||||
@ -97,8 +96,7 @@
|
||||
img_length_ratio: 1.495
|
||||
ipadapter_image: ${.condition_images}
|
||||
name: dufu
|
||||
prompt: (masterpiece, best quality, highres:1),(1girl, solo:1),(beautiful face,
|
||||
soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
|
||||
prompt: (masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3),Chinese ink painting style
|
||||
refer_image: ${.condition_images}
|
||||
video_path: null
|
||||
width: 471
|
||||
@ -119,8 +117,7 @@
|
||||
img_length_ratio: 0.88
|
||||
ipadapter_image: ${.condition_images}
|
||||
name: Portrait-of-Dr.-Gachet
|
||||
prompt: (masterpiece, best quality, highres:1),(1girl, solo:1),(beautiful face,
|
||||
soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
|
||||
prompt: (masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3)
|
||||
refer_image: ${.condition_images}
|
||||
video_path: null
|
||||
width: 800
|
||||
@ -130,8 +127,7 @@
|
||||
img_length_ratio: 1.246
|
||||
ipadapter_image: ${.condition_images}
|
||||
name: Self-Portrait-with-Cropped-Hair
|
||||
prompt: (masterpiece, best quality, highres:1),(1girl, solo:1),(beautiful face,
|
||||
soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
|
||||
prompt: (masterpiece, best quality, highres:1),(1boy, solo:1),(eye blinks:1.8),(head wave:1.3), animate
|
||||
refer_image: ${.condition_images}
|
||||
video_path: null
|
||||
width: 848
|
||||
@ -141,8 +137,7 @@
|
||||
img_length_ratio: 0.587
|
||||
ipadapter_image: ${.condition_images}
|
||||
name: The-Laughing-Cavalier
|
||||
prompt: (masterpiece, best quality, highres:1),(1girl, solo:1),(beautiful face,
|
||||
soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
|
||||
prompt: (masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3)
|
||||
refer_image: ${.condition_images}
|
||||
video_path: null
|
||||
width: 1200
|
||||
@@ -1 +1 @@
-Subproject commit 54a432af35f4f264f4a8361c7387fa8092c2dd7f
+Subproject commit 54c6c49baf68bff290679f5bb896715f25932133
data/demo/cyber_girl.png (new binary file, 110 KiB)
data/demo/video1.mp4 (new binary file)
@ -452,6 +452,43 @@ class DiffusersPipelinePredictor(object):
|
||||
2. when the input parameter is None, use text2video to generate the vis cond image, and use it as refer_image and ip_adapter_image too.
|
||||
3. given from the input parameter but still redrawn; update with the redrawn vis cond image.
|
||||
"""
|
||||
# crop resize images
|
||||
if condition_images is not None:
|
||||
logger.debug(
|
||||
f"center crop resize condition_images={condition_images.shape}, to height={height}, width={width}"
|
||||
)
|
||||
condition_images = batch_dynamic_crop_resize_images_v2(
|
||||
condition_images,
|
||||
target_height=height,
|
||||
target_width=width,
|
||||
)
|
||||
if refer_image is not None:
|
||||
logger.debug(
|
||||
f"center crop resize refer_image to height={height}, width={width}"
|
||||
)
|
||||
refer_image = batch_dynamic_crop_resize_images_v2(
|
||||
refer_image,
|
||||
target_height=height,
|
||||
target_width=width,
|
||||
)
|
||||
if ip_adapter_image is not None:
|
||||
logger.debug(
|
||||
f"center crop resize ip_adapter_image to height={height}, width={width}"
|
||||
)
|
||||
ip_adapter_image = batch_dynamic_crop_resize_images_v2(
|
||||
ip_adapter_image,
|
||||
target_height=height,
|
||||
target_width=width,
|
||||
)
|
||||
if refer_face_image is not None:
|
||||
logger.debug(
|
||||
f"center crop resize refer_face_image to height={height}, width={width}"
|
||||
)
|
||||
refer_face_image = batch_dynamic_crop_resize_images_v2(
|
||||
refer_face_image,
|
||||
target_height=height,
|
||||
target_width=width,
|
||||
)
|
||||
run_video_length = video_length
|
||||
# generate vision condition frame start
|
||||
# if condition_images is None, generate with refer_image, ip_adapter_image
|
||||
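The four crop-resize blocks added above repeat the same guard / log / resize pattern for each optional input. A possible consolidation (a sketch only, not part of this diff; it assumes the `batch_dynamic_crop_resize_images_v2` and `logger` names already imported in this module keep the signatures shown above):

```python
def _center_crop_resize_if_given(images, height, width, name):
    # Same behaviour as the repeated blocks above: skip None inputs,
    # log the target size, then center-crop-resize to (height, width).
    if images is None:
        return None
    logger.debug(f"center crop resize {name} to height={height}, width={width}")
    return batch_dynamic_crop_resize_images_v2(
        images, target_height=height, target_width=width
    )

condition_images = _center_crop_resize_if_given(condition_images, height, width, "condition_images")
refer_image = _center_crop_resize_if_given(refer_image, height, width, "refer_image")
ip_adapter_image = _center_crop_resize_if_given(ip_adapter_image, height, width, "ip_adapter_image")
refer_face_image = _center_crop_resize_if_given(refer_face_image, height, width, "refer_face_image")
```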
|
@ -1,6 +1,6 @@
|
||||
|
||||
tensorflow==2.12.0
|
||||
tensorboard==2.12.0
|
||||
# tensorflow==2.12.0
|
||||
# tensorboard==2.12.0
|
||||
|
||||
# torch==2.0.1+cu118
|
||||
# torchvision==0.15.2+cu118
|
||||
@ -50,18 +50,17 @@ requests
|
||||
scipy
|
||||
six
|
||||
tqdm
|
||||
gradio==3.43.2
|
||||
gradio==4.12
|
||||
albumentations==1.3.1
|
||||
opencv-contrib-python==4.8.0.76
|
||||
imageio-ffmpeg==0.4.8
|
||||
pytorch-lightning==2.0.8
|
||||
test-tube==0.7.5
|
||||
timm
|
||||
timm==0.9.12
|
||||
addict
|
||||
yapf
|
||||
prettytable
|
||||
safetensors==0.3.3
|
||||
basicsr
|
||||
fvcore
|
||||
pycocotools
|
||||
wandb==0.15.10
|
||||
@ -88,5 +87,16 @@ IProgress==0.4
|
||||
markupsafe==2.0.1
|
||||
xlsxwriter
|
||||
cuid
|
||||
spaces
|
||||
|
||||
# https://mirrors.cloud.tencent.com/pypi/packages/de/a6/a49d5af79a515f5c9552a26b2078d839c40fcf8dccc0d94a1269276ab181/tb_nightly-2.1.0a20191022-py3-none-any.whl
|
||||
basicsr
|
||||
|
||||
git+https://github.com/tencent-ailab/IP-Adapter.git@main
|
||||
git+https://github.com/openai/CLIP.git@main
|
||||
|
||||
git+https://github.com/TMElyralab/controlnet_aux.git@tme
|
||||
git+https://github.com/TMElyralab/diffusers.git@tme
|
||||
git+https://github.com/TMElyralab/MMCM.git@main
|
||||
|
||||
numpy==1.23.5
|
@ -4,12 +4,14 @@ import pdb
|
||||
|
||||
import cuid
|
||||
import gradio as gr
|
||||
|
||||
import spaces
|
||||
import numpy as np
|
||||
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
ProjectDir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
|
||||
CheckpointsDir = os.path.join(ProjectDir, "checkpoints")
|
||||
ignore_video2video = False
|
||||
|
||||
|
||||
def download_model():
|
||||
@ -28,17 +30,91 @@ def download_model():
|
||||
|
||||
|
||||
download_model() # for huggingface deployment.
|
||||
|
||||
from gradio_video2video import online_v2v_inference
|
||||
if not ignore_video2video:
|
||||
from gradio_video2video import online_v2v_inference
|
||||
from gradio_text2video import online_t2v_inference
|
||||
|
||||
|
||||
def update_shape(image):
|
||||
if image != None:
|
||||
h, w, _ = image.shape
|
||||
@spaces.GPU(duration=180)
|
||||
def hf_online_t2v_inference(
|
||||
prompt,
|
||||
image_np,
|
||||
seed,
|
||||
fps,
|
||||
w,
|
||||
h,
|
||||
video_len,
|
||||
img_edge_ratio,
|
||||
):
|
||||
if not isinstance(image_np, np.ndarray): # None
|
||||
raise gr.Error("Need input reference image")
|
||||
return online_t2v_inference(
|
||||
prompt, image_np, seed, fps, w, h, video_len, img_edge_ratio
|
||||
)
|
||||
|
||||
|
||||
@spaces.GPU(duration=180)
|
||||
def hg_online_v2v_inference(
|
||||
prompt,
|
||||
image_np,
|
||||
video,
|
||||
processor,
|
||||
seed,
|
||||
fps,
|
||||
w,
|
||||
h,
|
||||
video_length,
|
||||
img_edge_ratio,
|
||||
):
|
||||
if not isinstance(image_np, np.ndarray): # None
|
||||
raise gr.Error("Need input reference image")
|
||||
return online_v2v_inference(
|
||||
prompt,
|
||||
image_np,
|
||||
video,
|
||||
processor,
|
||||
seed,
|
||||
fps,
|
||||
w,
|
||||
h,
|
||||
video_length,
|
||||
img_edge_ratio,
|
||||
)
|
||||
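The two thin wrappers above exist so the actual inference calls can carry the `@spaces.GPU` decorator used for Hugging Face ZeroGPU scheduling, and so a missing reference image fails fast with a readable UI error. A minimal sketch of the same pattern (the `run_inference` name and its body are placeholders, not part of this diff):

```python
import numpy as np
import gradio as gr
import spaces  # the Hugging Face `spaces` package imported above

@spaces.GPU(duration=180)  # request a ZeroGPU slot of up to 180 s per call
def run_inference(image_np):
    # gr.Image passes None when no image was uploaded; surface a readable
    # error in the UI instead of failing deep inside the pipeline.
    if not isinstance(image_np, np.ndarray):
        raise gr.Error("Need input reference image")
    return image_np.shape  # placeholder for the real pipeline call
```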
|
||||
|
||||
def limit_shape(image, input_w, input_h, img_edge_ratio, max_image_edge=960):
|
||||
"""limite generation video shape to avoid gpu memory overflow"""
|
||||
if isinstance(image, np.ndarray) and (input_h == -1 and input_w == -1):
|
||||
input_h, input_w, _ = image.shape
|
||||
h, w, _ = image.shape
|
||||
if img_edge_ratio == 0:
|
||||
img_edge_ratio = 1
|
||||
img_edge_ratio_infact = min(max_image_edge / max(input_h, input_w), img_edge_ratio)
|
||||
# print(
|
||||
# image.shape,
|
||||
# input_w,
|
||||
# input_h,
|
||||
# img_edge_ratio,
|
||||
# max_image_edge,
|
||||
# img_edge_ratio_infact,
|
||||
# )
|
||||
if img_edge_ratio != 1:
|
||||
return (
|
||||
img_edge_ratio_infact,
|
||||
input_w * img_edge_ratio_infact,
|
||||
input_h * img_edge_ratio_infact,
|
||||
)
|
||||
else:
|
||||
h, w = 768, 512
|
||||
return w, h
|
||||
return img_edge_ratio_infact, -1, -1
|
||||
|
||||
|
||||
def limit_length(length):
|
||||
"""limite generation video frames numer to avoid gpu memory overflow"""
|
||||
|
||||
if length > 24 * 6:
|
||||
gr.Warning("Length need to smaller than 144, dute to gpu memory limit")
|
||||
length = 24 * 6
|
||||
return length
|
||||
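The `limit_shape` helper above caps the effective scaling so the longest edge of the generated video never exceeds `max_image_edge` (960 px by default), and `limit_length` caps the frame count at 144. A self-contained sketch of the shape arithmetic, with a worked example:

```python
def effective_edge_ratio(input_h, input_w, img_edge_ratio, max_image_edge=960):
    # Core computation of limit_shape: never let the longest scaled edge
    # exceed max_image_edge.
    if img_edge_ratio == 0:
        img_edge_ratio = 1
    return min(max_image_edge / max(input_h, input_w), img_edge_ratio)

# A 1280x720 reference image with the default ratio 1.0 is capped to
# 960 / 1280 = 0.75, i.e. a 960x540 target video.
ratio = effective_edge_ratio(720, 1280, 1.0)
print(ratio, int(1280 * ratio), int(720 * ratio))  # 0.75 960 540
```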
|
||||
|
||||
class ConcatenateBlock(gr.blocks.Block):
|
||||
@ -121,97 +197,179 @@ with gr.Blocks(css=css) as demo:
|
||||
with gr.Column():
|
||||
prompt = gr.Textbox(label="Prompt")
|
||||
image = gr.Image(label="VisionCondImage")
|
||||
gr.Markdown("seed=-1 means that the seeds run each time are different")
|
||||
seed = gr.Number(label="Seed", value=-1)
|
||||
video_length = gr.Number(label="Video Length", value=12)
|
||||
seed = gr.Number(
|
||||
label="Seed (seed=-1 means that the seeds run each time are different)",
|
||||
value=-1,
|
||||
)
|
||||
video_length = gr.Number(
|
||||
label="Video Length(need smaller than 144,If you want to be able to generate longer videos, run it locally )",
|
||||
value=12,
|
||||
)
|
||||
fps = gr.Number(label="Generate Video FPS", value=6)
|
||||
gr.Markdown(
|
||||
(
|
||||
"If W&H is -1, then use the Reference Image's Size. Size of target video is $(W, H)*img\_edge\_ratio$. \n"
|
||||
"The shorter the image size, the larger the motion amplitude, and the lower video quality.\n"
|
||||
"The longer the W&H, the smaller the motion amplitude, and the higher video quality"
|
||||
"The longer the W&H, the smaller the motion amplitude, and the higher video quality.\n"
|
||||
"Due to the GPU VRAM limits, the W&H need smaller than 960px"
|
||||
)
|
||||
)
|
||||
with gr.Row():
|
||||
w = gr.Number(label="Width", value=-1)
|
||||
h = gr.Number(label="Height", value=-1)
|
||||
img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
|
||||
|
||||
with gr.Row():
|
||||
out_w = gr.Number(label="Output Width", value=0, interactive=False)
|
||||
out_h = gr.Number(label="Output Height", value=0, interactive=False)
|
||||
img_edge_ratio_infact = gr.Number(
|
||||
label="img_edge_ratio in fact",
|
||||
value=1.0,
|
||||
interactive=False,
|
||||
)
|
||||
btn1 = gr.Button("Generate")
|
||||
out = gr.outputs.Video()
|
||||
out = gr.Video()
|
||||
# pdb.set_trace()
|
||||
i2v_examples_256 = [
|
||||
[
|
||||
"(masterpiece, best quality, highres:1),(1boy, solo:1),(eye blinks:1.8),(head wave:1.3)",
|
||||
"../../data/images/yongen.jpeg",
|
||||
],
|
||||
[
|
||||
"(masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3)",
|
||||
"../../data/images/The-Laughing-Cavalier.jpg",
|
||||
],
|
||||
]
|
||||
with gr.Row():
|
||||
board = gr.Dataframe(
|
||||
value=[["", "", ""]] * 3,
|
||||
interactive=False,
|
||||
type="array",
|
||||
label="Demo Video",
|
||||
gr.Examples(
|
||||
examples=i2v_examples_256,
|
||||
inputs=[prompt, image],
|
||||
outputs=[out],
|
||||
fn=hf_online_t2v_inference,
|
||||
cache_examples=False,
|
||||
)
|
||||
|
||||
# image.change(fn=update_shape, inputs=[image], outputs=[w, h])
|
||||
|
||||
btn1.click(
|
||||
fn=online_t2v_inference,
|
||||
inputs=[prompt, image, seed, fps, w, h, video_length, img_edge_ratio],
|
||||
outputs=out,
|
||||
img_edge_ratio.change(
|
||||
fn=limit_shape,
|
||||
inputs=[image, w, h, img_edge_ratio],
|
||||
outputs=[img_edge_ratio_infact, out_w, out_h],
|
||||
)
|
||||
|
||||
with gr.Tab("Video to Video"):
|
||||
with gr.Row():
|
||||
with gr.Column():
|
||||
prompt = gr.Textbox(label="Prompt")
|
||||
gr.Markdown(
|
||||
(
|
||||
"pose of VisionCondImage should be same as of the first frame of the video. "
|
||||
"its better generate target first frame whose pose is same as of first frame of the video with text2image tool, sch as MJ, SDXL."
|
||||
)
|
||||
)
|
||||
image = gr.Image(label="VisionCondImage")
|
||||
video = gr.Video(label="ReferVideo")
|
||||
# radio = gr.inputs.Radio(, label="Select an option")
|
||||
# ctr_button = gr.inputs.Button(label="Add ControlNet List")
|
||||
# output_text = gr.outputs.Textbox()
|
||||
processor = gr.Textbox(
|
||||
label=f"Control Condition. gradio code now only support dwpose_body_hand, use command can support multi of {control_options}",
|
||||
value="dwpose_body_hand",
|
||||
)
|
||||
gr.Markdown("seed=-1 means that seeds are different in every run")
|
||||
seed = gr.Number(label="Seed", value=-1)
|
||||
video_length = gr.Number(label="Video Length", value=12)
|
||||
fps = gr.Number(label="Generate Video FPS", value=6)
|
||||
gr.Markdown(
|
||||
(
|
||||
"If W&H is -1, then use the Reference Image's Size. Size of target video is $(W, H)*img\_edge\_ratio$. \n"
|
||||
"The shorter the image size, the larger the motion amplitude, and the lower video quality. \n"
|
||||
"The longer the W&H, the smaller the motion amplitude, and the higher video quality. "
|
||||
)
|
||||
)
|
||||
with gr.Row():
|
||||
w = gr.Number(label="Width", value=-1)
|
||||
h = gr.Number(label="Height", value=-1)
|
||||
img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
|
||||
video_length.change(
|
||||
fn=limit_length, inputs=[video_length], outputs=[video_length]
|
||||
)
|
||||
|
||||
btn2 = gr.Button("Generate")
|
||||
out1 = gr.outputs.Video()
|
||||
# image.change(fn=update_shape, inputs=[image], outputs=[w, h])
|
||||
|
||||
btn2.click(
|
||||
fn=online_v2v_inference,
|
||||
btn1.click(
|
||||
fn=hf_online_t2v_inference,
|
||||
inputs=[
|
||||
prompt,
|
||||
image,
|
||||
video,
|
||||
processor,
|
||||
seed,
|
||||
fps,
|
||||
w,
|
||||
h,
|
||||
video_length,
|
||||
img_edge_ratio,
|
||||
img_edge_ratio_infact,
|
||||
],
|
||||
outputs=out1,
|
||||
outputs=out,
|
||||
)
|
||||
|
||||
with gr.Tab("Video to Video"):
|
||||
if ignore_video2video:
|
||||
gr.Markdown(
|
||||
(
|
||||
"Due to GPU limit, MuseVDemo now only support Text2Video. If you want to try Video2Video, please run it locally. \n"
|
||||
"We are trying to support video2video in the future. Thanks for your understanding."
|
||||
)
|
||||
)
|
||||
else:
|
||||
with gr.Row():
|
||||
with gr.Column():
|
||||
prompt = gr.Textbox(label="Prompt")
|
||||
gr.Markdown(
|
||||
(
|
||||
"pose of VisionCondImage should be same as of the first frame of the video. "
|
||||
"its better generate target first frame whose pose is same as of first frame of the video with text2image tool, sch as MJ, SDXL."
|
||||
)
|
||||
)
|
||||
image = gr.Image(label="VisionCondImage")
|
||||
video = gr.Video(label="ReferVideo")
|
||||
# radio = gr.inputs.Radio(, label="Select an option")
|
||||
# ctr_button = gr.inputs.Button(label="Add ControlNet List")
|
||||
# output_text = gr.outputs.Textbox()
|
||||
processor = gr.Textbox(
|
||||
label=f"Control Condition. gradio code now only support dwpose_body_hand, use command can support multi of {control_options}",
|
||||
value="dwpose_body_hand",
|
||||
)
|
||||
gr.Markdown("seed=-1 means that seeds are different in every run")
|
||||
seed = gr.Number(
|
||||
label="Seed (seed=-1 means that the seeds run each time are different)",
|
||||
value=-1,
|
||||
)
|
||||
video_length = gr.Number(label="Video Length", value=12)
|
||||
fps = gr.Number(label="Generate Video FPS", value=6)
|
||||
gr.Markdown(
|
||||
(
|
||||
"If W&H is -1, then use the Reference Image's Size. Size of target video is $(W, H)*img\_edge\_ratio$. \n"
|
||||
"The shorter the image size, the larger the motion amplitude, and the lower video quality.\n"
|
||||
"The longer the W&H, the smaller the motion amplitude, and the higher video quality.\n"
|
||||
"Due to the GPU VRAM limits, the W&H need smaller than 2000px"
|
||||
)
|
||||
)
|
||||
with gr.Row():
|
||||
w = gr.Number(label="Width", value=-1)
|
||||
h = gr.Number(label="Height", value=-1)
|
||||
img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
|
||||
|
||||
with gr.Row():
|
||||
out_w = gr.Number(label="Width", value=0, interactive=False)
|
||||
out_h = gr.Number(label="Height", value=0, interactive=False)
|
||||
img_edge_ratio_infact = gr.Number(
|
||||
label="img_edge_ratio in fact",
|
||||
value=1.0,
|
||||
interactive=False,
|
||||
)
|
||||
btn2 = gr.Button("Generate")
|
||||
out1 = gr.Video()
|
||||
|
||||
v2v_examples_256 = [
|
||||
[
|
||||
"(masterpiece, best quality, highres:1), harley quinn is dancing, animation, by joshua klein",
|
||||
"../../data/demo/cyber_girl.png",
|
||||
"../../data/demo/video1.mp4",
|
||||
],
|
||||
]
|
||||
with gr.Row():
|
||||
gr.Examples(
|
||||
examples=v2v_examples_256,
|
||||
inputs=[prompt, image, video],
|
||||
outputs=[out],
|
||||
fn=hg_online_v2v_inference,
|
||||
cache_examples=False,
|
||||
)
|
||||
img_edge_ratio.change(
|
||||
fn=limit_shape,
|
||||
inputs=[image, w, h, img_edge_ratio],
|
||||
outputs=[img_edge_ratio_infact, out_w, out_h],
|
||||
)
|
||||
video_length.change(
|
||||
fn=limit_length, inputs=[video_length], outputs=[video_length]
|
||||
)
|
||||
btn2.click(
|
||||
fn=hg_online_v2v_inference,
|
||||
inputs=[
|
||||
prompt,
|
||||
image,
|
||||
video,
|
||||
processor,
|
||||
seed,
|
||||
fps,
|
||||
w,
|
||||
h,
|
||||
video_length,
|
||||
img_edge_ratio_infact,
|
||||
],
|
||||
outputs=out1,
|
||||
)
|
||||
|
||||
|
||||
# Set the IP and port
|
||||
ip_address = "0.0.0.0" # Replace with your desired IP address
|
||||
@ -219,5 +377,5 @@ port_number = 7860 # Replace with your desired port number
|
||||
|
||||
|
||||
demo.queue().launch(
|
||||
share=False, debug=True, server_name=ip_address, server_port=port_number
|
||||
share=True, debug=True, server_name=ip_address, server_port=port_number
|
||||
)
|
||||
|
scripts/gradio/app_docker_space.py (new file, 381 lines)
@ -0,0 +1,381 @@
|
||||
import os
|
||||
import time
|
||||
import pdb
|
||||
|
||||
import cuid
|
||||
import gradio as gr
|
||||
import spaces
|
||||
import numpy as np
|
||||
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
ProjectDir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
|
||||
CheckpointsDir = os.path.join(ProjectDir, "checkpoints")
|
||||
ignore_video2video = True
|
||||
|
||||
|
||||
def download_model():
|
||||
if not os.path.exists(CheckpointsDir):
|
||||
print("Checkpoint Not Downloaded, start downloading...")
|
||||
tic = time.time()
|
||||
snapshot_download(
|
||||
repo_id="TMElyralab/MuseV",
|
||||
local_dir=CheckpointsDir,
|
||||
max_workers=8,
|
||||
)
|
||||
toc = time.time()
|
||||
print(f"download cost {toc-tic} seconds")
|
||||
else:
|
||||
print("Already download the model.")
|
||||
|
||||
|
||||
download_model() # for huggingface deployment.
|
||||
if not ignore_video2video:
|
||||
from gradio_video2video import online_v2v_inference
|
||||
from gradio_text2video import online_t2v_inference
|
||||
|
||||
|
||||
@spaces.GPU(duration=180)
|
||||
def hf_online_t2v_inference(
|
||||
prompt,
|
||||
image_np,
|
||||
seed,
|
||||
fps,
|
||||
w,
|
||||
h,
|
||||
video_len,
|
||||
img_edge_ratio,
|
||||
):
|
||||
if not isinstance(image_np, np.ndarray): # None
|
||||
raise gr.Error("Need input reference image")
|
||||
return online_t2v_inference(
|
||||
prompt, image_np, seed, fps, w, h, video_len, img_edge_ratio
|
||||
)
|
||||
|
||||
|
||||
@spaces.GPU(duration=180)
|
||||
def hg_online_v2v_inference(
|
||||
prompt,
|
||||
image_np,
|
||||
video,
|
||||
processor,
|
||||
seed,
|
||||
fps,
|
||||
w,
|
||||
h,
|
||||
video_length,
|
||||
img_edge_ratio,
|
||||
):
|
||||
if not isinstance(image_np, np.ndarray): # None
|
||||
raise gr.Error("Need input reference image")
|
||||
return online_v2v_inference(
|
||||
prompt,
|
||||
image_np,
|
||||
video,
|
||||
processor,
|
||||
seed,
|
||||
fps,
|
||||
w,
|
||||
h,
|
||||
video_length,
|
||||
img_edge_ratio,
|
||||
)
|
||||
|
||||
|
||||
def limit_shape(image, input_w, input_h, img_edge_ratio, max_image_edge=960):
|
||||
"""limite generation video shape to avoid gpu memory overflow"""
|
||||
if isinstance(image, np.ndarray) and (input_h == -1 and input_w == -1):
|
||||
input_h, input_w, _ = image.shape
|
||||
h, w, _ = image.shape
|
||||
if img_edge_ratio == 0:
|
||||
img_edge_ratio = 1
|
||||
img_edge_ratio_infact = min(max_image_edge / max(input_h, input_w), img_edge_ratio)
|
||||
# print(
|
||||
# image.shape,
|
||||
# input_w,
|
||||
# input_h,
|
||||
# img_edge_ratio,
|
||||
# max_image_edge,
|
||||
# img_edge_ratio_infact,
|
||||
# )
|
||||
if img_edge_ratio != 1:
|
||||
return (
|
||||
img_edge_ratio_infact,
|
||||
input_w * img_edge_ratio_infact,
|
||||
input_h * img_edge_ratio_infact,
|
||||
)
|
||||
else:
|
||||
return img_edge_ratio_infact, -1, -1
|
||||
|
||||
|
||||
def limit_length(length):
|
||||
"""limite generation video frames numer to avoid gpu memory overflow"""
|
||||
|
||||
if length > 24 * 6:
|
||||
gr.Warning("Length need to smaller than 144, dute to gpu memory limit")
|
||||
length = 24 * 6
|
||||
return length
|
||||
|
||||
|
||||
class ConcatenateBlock(gr.blocks.Block):
|
||||
def __init__(self, options):
|
||||
self.options = options
|
||||
self.current_string = ""
|
||||
|
||||
def update_string(self, new_choice):
|
||||
if new_choice and new_choice not in self.current_string.split(", "):
|
||||
if self.current_string == "":
|
||||
self.current_string = new_choice
|
||||
else:
|
||||
self.current_string += ", " + new_choice
|
||||
return self.current_string
|
||||
|
||||
|
||||
def process_input(new_choice):
|
||||
return concatenate_block.update_string(new_choice), ""
|
||||
|
||||
|
||||
control_options = [
|
||||
"pose",
|
||||
"pose_body",
|
||||
"pose_hand",
|
||||
"pose_face",
|
||||
"pose_hand_body",
|
||||
"pose_hand_face",
|
||||
"dwpose",
|
||||
"dwpose_face",
|
||||
"dwpose_hand",
|
||||
"dwpose_body",
|
||||
"dwpose_body_hand",
|
||||
"canny",
|
||||
"tile",
|
||||
"hed",
|
||||
"hed_scribble",
|
||||
"depth",
|
||||
"pidi",
|
||||
"normal_bae",
|
||||
"lineart",
|
||||
"lineart_anime",
|
||||
"zoe",
|
||||
"sam",
|
||||
"mobile_sam",
|
||||
"leres",
|
||||
"content",
|
||||
"face_detector",
|
||||
]
|
||||
concatenate_block = ConcatenateBlock(control_options)
|
||||
|
||||
|
||||
css = """#input_img {max-width: 1024px !important} #output_vid {max-width: 1024px; max-height: 576px}"""
|
||||
|
||||
|
||||
with gr.Blocks(css=css) as demo:
|
||||
gr.Markdown(
|
||||
"<div align='center'> <h1> MuseV: Infinite-length and High Fidelity Virtual Human Video Generation with Visual Conditioned Parallel Denoising</span> </h1> \
|
||||
<h2 style='font-weight: 450; font-size: 1rem; margin: 0rem'>\
|
||||
</br>\
|
||||
Zhiqiang Xia <sup>*</sup>,\
|
||||
Zhaokang Chen<sup>*</sup>,\
|
||||
Bin Wu<sup>†</sup>,\
|
||||
Chao Li,\
|
||||
Kwok-Wai Hung,\
|
||||
Chao Zhan,\
|
||||
Yingjie He,\
|
||||
Wenjiang Zhou\
|
||||
(<sup>*</sup>Equal Contribution, <sup>†</sup>Corresponding Author, benbinwu@tencent.com)\
|
||||
</br>\
|
||||
Lyra Lab, Tencent Music Entertainment\
|
||||
</h2> \
|
||||
<a style='font-size:18px;color: #000000' href='https://github.com/TMElyralab/MuseV'>[Github Repo]</a>\
|
||||
<a style='font-size:18px;color: #000000'>, which is important to Open-Source projects. Thanks!</a>\
|
||||
<a style='font-size:18px;color: #000000' href=''> [ArXiv(Coming Soon)] </a>\
|
||||
<a style='font-size:18px;color: #000000' href=''> [Project Page(Coming Soon)] </a> \
|
||||
<a style='font-size:18px;color: #000000'>If MuseV is useful, please help star the repo~ </a> </div>"
|
||||
)
|
||||
with gr.Tab("Text to Video"):
|
||||
with gr.Row():
|
||||
with gr.Column():
|
||||
prompt = gr.Textbox(label="Prompt")
|
||||
image = gr.Image(label="VisionCondImage")
|
||||
seed = gr.Number(
|
||||
label="Seed (seed=-1 means that the seeds run each time are different)",
|
||||
value=-1,
|
||||
)
|
||||
video_length = gr.Number(
|
||||
label="Video Length(need smaller than 144,If you want to be able to generate longer videos, run it locally )",
|
||||
value=12,
|
||||
)
|
||||
fps = gr.Number(label="Generate Video FPS", value=6)
|
||||
gr.Markdown(
|
||||
(
|
||||
"If W&H is -1, then use the Reference Image's Size. Size of target video is $(W, H)*img\_edge\_ratio$. \n"
|
||||
"The shorter the image size, the larger the motion amplitude, and the lower video quality.\n"
|
||||
"The longer the W&H, the smaller the motion amplitude, and the higher video quality.\n"
|
||||
"Due to the GPU VRAM limits, the W&H need smaller than 960px"
|
||||
)
|
||||
)
|
||||
with gr.Row():
|
||||
w = gr.Number(label="Width", value=-1)
|
||||
h = gr.Number(label="Height", value=-1)
|
||||
img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
|
||||
with gr.Row():
|
||||
out_w = gr.Number(label="Output Width", value=0, interactive=False)
|
||||
out_h = gr.Number(label="Output Height", value=0, interactive=False)
|
||||
img_edge_ratio_infact = gr.Number(
|
||||
label="img_edge_ratio in fact",
|
||||
value=1.0,
|
||||
interactive=False,
|
||||
)
|
||||
btn1 = gr.Button("Generate")
|
||||
out = gr.Video()
|
||||
# pdb.set_trace()
|
||||
i2v_examples_256 = [
|
||||
[
|
||||
"(masterpiece, best quality, highres:1),(1boy, solo:1),(eye blinks:1.8),(head wave:1.3)",
|
||||
"../../data/images/yongen.jpeg",
|
||||
],
|
||||
[
|
||||
"(masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3)",
|
||||
"../../data/images/The-Laughing-Cavalier.jpg",
|
||||
],
|
||||
]
|
||||
with gr.Row():
|
||||
gr.Examples(
|
||||
examples=i2v_examples_256,
|
||||
inputs=[prompt, image],
|
||||
outputs=[out],
|
||||
fn=hf_online_t2v_inference,
|
||||
cache_examples=False,
|
||||
)
|
||||
img_edge_ratio.change(
|
||||
fn=limit_shape,
|
||||
inputs=[image, w, h, img_edge_ratio],
|
||||
outputs=[img_edge_ratio_infact, out_w, out_h],
|
||||
)
|
||||
|
||||
video_length.change(
|
||||
fn=limit_length, inputs=[video_length], outputs=[video_length]
|
||||
)
|
||||
|
||||
btn1.click(
|
||||
fn=hf_online_t2v_inference,
|
||||
inputs=[
|
||||
prompt,
|
||||
image,
|
||||
seed,
|
||||
fps,
|
||||
w,
|
||||
h,
|
||||
video_length,
|
||||
img_edge_ratio_infact,
|
||||
],
|
||||
outputs=out,
|
||||
)
|
||||
|
||||
with gr.Tab("Video to Video"):
|
||||
if ignore_video2video:
|
||||
gr.Markdown(
|
||||
(
|
||||
"Due to GPU limit, MuseVDemo now only support Text2Video. If you want to try Video2Video, please run it locally. \n"
|
||||
"We are trying to support video2video in the future. Thanks for your understanding."
|
||||
)
|
||||
)
|
||||
else:
|
||||
with gr.Row():
|
||||
with gr.Column():
|
||||
prompt = gr.Textbox(label="Prompt")
|
||||
gr.Markdown(
|
||||
(
|
||||
"pose of VisionCondImage should be same as of the first frame of the video. "
|
||||
"its better generate target first frame whose pose is same as of first frame of the video with text2image tool, sch as MJ, SDXL."
|
||||
)
|
||||
)
|
||||
image = gr.Image(label="VisionCondImage")
|
||||
video = gr.Video(label="ReferVideo")
|
||||
# radio = gr.inputs.Radio(, label="Select an option")
|
||||
# ctr_button = gr.inputs.Button(label="Add ControlNet List")
|
||||
# output_text = gr.outputs.Textbox()
|
||||
processor = gr.Textbox(
|
||||
label=f"Control Condition. gradio code now only support dwpose_body_hand, use command can support multi of {control_options}",
|
||||
value="dwpose_body_hand",
|
||||
)
|
||||
gr.Markdown("seed=-1 means that seeds are different in every run")
|
||||
seed = gr.Number(
|
||||
label="Seed (seed=-1 means that the seeds run each time are different)",
|
||||
value=-1,
|
||||
)
|
||||
video_length = gr.Number(label="Video Length", value=12)
|
||||
fps = gr.Number(label="Generate Video FPS", value=6)
|
||||
gr.Markdown(
|
||||
(
|
||||
"If W&H is -1, then use the Reference Image's Size. Size of target video is $(W, H)*img\_edge\_ratio$. \n"
|
||||
"The shorter the image size, the larger the motion amplitude, and the lower video quality.\n"
|
||||
"The longer the W&H, the smaller the motion amplitude, and the higher video quality.\n"
|
||||
"Due to the GPU VRAM limits, the W&H need smaller than 2000px"
|
||||
)
|
||||
)
|
||||
with gr.Row():
|
||||
w = gr.Number(label="Width", value=-1)
|
||||
h = gr.Number(label="Height", value=-1)
|
||||
img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
|
||||
|
||||
with gr.Row():
|
||||
out_w = gr.Number(label="Width", value=0, interactive=False)
|
||||
out_h = gr.Number(label="Height", value=0, interactive=False)
|
||||
img_edge_ratio_infact = gr.Number(
|
||||
label="img_edge_ratio in fact",
|
||||
value=1.0,
|
||||
interactive=False,
|
||||
)
|
||||
btn2 = gr.Button("Generate")
|
||||
out1 = gr.Video()
|
||||
|
||||
v2v_examples_256 = [
|
||||
[
|
||||
"(masterpiece, best quality, highres:1), harley quinn is dancing, animation, by joshua klein",
|
||||
"../../data/demo/cyber_girl.png",
|
||||
"../../data/demo/video1.mp4",
|
||||
],
|
||||
]
|
||||
with gr.Row():
|
||||
gr.Examples(
|
||||
examples=v2v_examples_256,
|
||||
inputs=[prompt, image, video],
|
||||
outputs=[out],
|
||||
fn=hg_online_v2v_inference,
|
||||
cache_examples=False,
|
||||
)
|
||||
img_edge_ratio.change(
|
||||
fn=limit_shape,
|
||||
inputs=[image, w, h, img_edge_ratio],
|
||||
outputs=[img_edge_ratio_infact, out_w, out_h],
|
||||
)
|
||||
video_length.change(
|
||||
fn=limit_length, inputs=[video_length], outputs=[video_length]
|
||||
)
|
||||
btn2.click(
|
||||
fn=hg_online_v2v_inference,
|
||||
inputs=[
|
||||
prompt,
|
||||
image,
|
||||
video,
|
||||
processor,
|
||||
seed,
|
||||
fps,
|
||||
w,
|
||||
h,
|
||||
video_length,
|
||||
img_edge_ratio_infact,
|
||||
],
|
||||
outputs=out1,
|
||||
)
|
||||
|
||||
|
||||
# Set the IP and port
|
||||
ip_address = "0.0.0.0" # Replace with your desired IP address
|
||||
port_number = 7860 # Replace with your desired port number
|
||||
|
||||
|
||||
demo.queue().launch(
|
||||
share=True, debug=True, server_name=ip_address, server_port=port_number
|
||||
)
|
scripts/gradio/app_gradio_space.py (new file, 416 lines)
@ -0,0 +1,416 @@
|
||||
import os
|
||||
import time
|
||||
import pdb
|
||||
|
||||
import cuid
|
||||
import gradio as gr
|
||||
import spaces
|
||||
import numpy as np
|
||||
import sys
|
||||
|
||||
from huggingface_hub import snapshot_download
|
||||
import subprocess
|
||||
|
||||
|
||||
ProjectDir = os.path.abspath(os.path.dirname(__file__))
|
||||
CheckpointsDir = os.path.join(ProjectDir, "checkpoints")
|
||||
|
||||
sys.path.insert(0, ProjectDir)
|
||||
sys.path.insert(0, f"{ProjectDir}/MMCM")
|
||||
sys.path.insert(0, f"{ProjectDir}/diffusers/src")
|
||||
sys.path.insert(0, f"{ProjectDir}/controlnet_aux/src")
|
||||
sys.path.insert(0, f"{ProjectDir}/scripts/gradio")
|
||||
|
||||
result = subprocess.run(
|
||||
["pip", "install", "--no-cache-dir", "-U", "openmim"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
print(result)
|
||||
|
||||
result = subprocess.run(["mim", "install", "mmengine"], capture_output=True, text=True)
|
||||
print(result)
|
||||
|
||||
result = subprocess.run(
|
||||
["mim", "install", "mmcv>=2.0.1"], capture_output=True, text=True
|
||||
)
|
||||
print(result)
|
||||
|
||||
result = subprocess.run(
|
||||
["mim", "install", "mmdet>=3.1.0"], capture_output=True, text=True
|
||||
)
|
||||
print(result)
|
||||
|
||||
result = subprocess.run(
|
||||
["mim", "install", "mmpose>=1.1.0"], capture_output=True, text=True
|
||||
)
|
||||
print(result)
|
||||
ignore_video2video = True
|
||||
|
||||
|
||||
def download_model():
|
||||
if not os.path.exists(CheckpointsDir):
|
||||
print("Checkpoint Not Downloaded, start downloading...")
|
||||
tic = time.time()
|
||||
snapshot_download(
|
||||
repo_id="TMElyralab/MuseV",
|
||||
local_dir=CheckpointsDir,
|
||||
max_workers=8,
|
||||
local_dir_use_symlinks=True,
|
||||
)
|
||||
toc = time.time()
|
||||
print(f"download cost {toc-tic} seconds")
|
||||
else:
|
||||
print("Already download the model.")
|
||||
|
||||
|
||||
download_model() # for huggingface deployment.
|
||||
if not ignore_video2video:
|
||||
from gradio_video2video import online_v2v_inference
|
||||
from gradio_text2video import online_t2v_inference
|
||||
|
||||
|
||||
@spaces.GPU(duration=180)
|
||||
def hf_online_t2v_inference(
|
||||
prompt,
|
||||
image_np,
|
||||
seed,
|
||||
fps,
|
||||
w,
|
||||
h,
|
||||
video_len,
|
||||
img_edge_ratio,
|
||||
):
|
||||
if not isinstance(image_np, np.ndarray): # None
|
||||
raise gr.Error("Need input reference image")
|
||||
return online_t2v_inference(
|
||||
prompt, image_np, seed, fps, w, h, video_len, img_edge_ratio
|
||||
)
|
||||
|
||||
|
||||
@spaces.GPU(duration=180)
|
||||
def hg_online_v2v_inference(
|
||||
prompt,
|
||||
image_np,
|
||||
video,
|
||||
processor,
|
||||
seed,
|
||||
fps,
|
||||
w,
|
||||
h,
|
||||
video_length,
|
||||
img_edge_ratio,
|
||||
):
|
||||
if not isinstance(image_np, np.ndarray): # None
|
||||
raise gr.Error("Need input reference image")
|
||||
return online_v2v_inference(
|
||||
prompt,
|
||||
image_np,
|
||||
video,
|
||||
processor,
|
||||
seed,
|
||||
fps,
|
||||
w,
|
||||
h,
|
||||
video_length,
|
||||
img_edge_ratio,
|
||||
)
|
||||
|
||||
|
||||
def limit_shape(image, input_w, input_h, img_edge_ratio, max_image_edge=960):
|
||||
"""limite generation video shape to avoid gpu memory overflow"""
|
||||
if isinstance(image, np.ndarray) and (input_h == -1 and input_w == -1):
|
||||
input_h, input_w, _ = image.shape
|
||||
h, w, _ = image.shape
|
||||
if img_edge_ratio == 0:
|
||||
img_edge_ratio = 1
|
||||
img_edge_ratio_infact = min(max_image_edge / max(input_h, input_w), img_edge_ratio)
|
||||
# print(
|
||||
# image.shape,
|
||||
# input_w,
|
||||
# input_h,
|
||||
# img_edge_ratio,
|
||||
# max_image_edge,
|
||||
# img_edge_ratio_infact,
|
||||
# )
|
||||
if img_edge_ratio != 1:
|
||||
return (
|
||||
img_edge_ratio_infact,
|
||||
input_w * img_edge_ratio_infact,
|
||||
input_h * img_edge_ratio_infact,
|
||||
)
|
||||
else:
|
||||
return img_edge_ratio_infact, -1, -1
|
||||
|
||||
|
||||
def limit_length(length):
|
||||
"""limite generation video frames numer to avoid gpu memory overflow"""
|
||||
|
||||
if length > 24 * 6:
|
||||
gr.Warning("Length need to smaller than 144, dute to gpu memory limit")
|
||||
length = 24 * 6
|
||||
return length
|
||||
|
||||
|
||||
class ConcatenateBlock(gr.blocks.Block):
|
||||
def __init__(self, options):
|
||||
self.options = options
|
||||
self.current_string = ""
|
||||
|
||||
def update_string(self, new_choice):
|
||||
if new_choice and new_choice not in self.current_string.split(", "):
|
||||
if self.current_string == "":
|
||||
self.current_string = new_choice
|
||||
else:
|
||||
self.current_string += ", " + new_choice
|
||||
return self.current_string
|
||||
|
||||
|
||||
def process_input(new_choice):
|
||||
return concatenate_block.update_string(new_choice), ""
|
||||
|
||||
|
||||
control_options = [
|
||||
"pose",
|
||||
"pose_body",
|
||||
"pose_hand",
|
||||
"pose_face",
|
||||
"pose_hand_body",
|
||||
"pose_hand_face",
|
||||
"dwpose",
|
||||
"dwpose_face",
|
||||
"dwpose_hand",
|
||||
"dwpose_body",
|
||||
"dwpose_body_hand",
|
||||
"canny",
|
||||
"tile",
|
||||
"hed",
|
||||
"hed_scribble",
|
||||
"depth",
|
||||
"pidi",
|
||||
"normal_bae",
|
||||
"lineart",
|
||||
"lineart_anime",
|
||||
"zoe",
|
||||
"sam",
|
||||
"mobile_sam",
|
||||
"leres",
|
||||
"content",
|
||||
"face_detector",
|
||||
]
|
||||
concatenate_block = ConcatenateBlock(control_options)
|
||||
|
||||
|
||||
css = """#input_img {max-width: 1024px !important} #output_vid {max-width: 1024px; max-height: 576px}"""
|
||||
|
||||
|
||||
with gr.Blocks(css=css) as demo:
|
||||
gr.Markdown(
|
||||
"<div align='center'> <h1> MuseV: Infinite-length and High Fidelity Virtual Human Video Generation with Visual Conditioned Parallel Denoising</span> </h1> \
|
||||
<h2 style='font-weight: 450; font-size: 1rem; margin: 0rem'>\
|
||||
</br>\
|
||||
Zhiqiang Xia <sup>*</sup>,\
|
||||
Zhaokang Chen<sup>*</sup>,\
|
||||
Bin Wu<sup>†</sup>,\
|
||||
Chao Li,\
|
||||
Kwok-Wai Hung,\
|
||||
Chao Zhan,\
|
||||
Yingjie He,\
|
||||
Wenjiang Zhou\
|
||||
(<sup>*</sup>Equal Contribution, <sup>†</sup>Corresponding Author, benbinwu@tencent.com)\
|
||||
</br>\
|
||||
Lyra Lab, Tencent Music Entertainment\
|
||||
</h2> \
|
||||
<a style='font-size:18px;color: #000000' href='https://github.com/TMElyralab/MuseV'>[Github Repo]</a>\
|
||||
<a style='font-size:18px;color: #000000'>, which is important to Open-Source projects. Thanks!</a>\
|
||||
<a style='font-size:18px;color: #000000' href=''> [ArXiv(Coming Soon)] </a>\
|
||||
<a style='font-size:18px;color: #000000' href=''> [Project Page(Coming Soon)] </a> \
|
||||
<a style='font-size:18px;color: #000000'>If MuseV is useful, please help star the repo~ </a> </div>"
|
||||
)
|
||||
with gr.Tab("Text to Video"):
|
||||
with gr.Row():
|
||||
with gr.Column():
|
||||
prompt = gr.Textbox(label="Prompt")
|
||||
image = gr.Image(label="VisionCondImage")
|
||||
seed = gr.Number(
|
||||
label="Seed (seed=-1 means that the seeds run each time are different)",
|
||||
value=-1,
|
||||
)
|
||||
video_length = gr.Number(
|
||||
label="Video Length(need smaller than 144,If you want to be able to generate longer videos, run it locally )",
|
||||
value=12,
|
||||
)
|
||||
fps = gr.Number(label="Generate Video FPS", value=6)
|
||||
gr.Markdown(
|
||||
(
|
||||
"If W&H is -1, then use the Reference Image's Size. Size of target video is $(W, H)*img\_edge\_ratio$. \n"
|
||||
"The shorter the image size, the larger the motion amplitude, and the lower video quality.\n"
|
||||
"The longer the W&H, the smaller the motion amplitude, and the higher video quality.\n"
|
||||
"Due to the GPU VRAM limits, the W&H need smaller than 960px"
|
||||
)
|
||||
)
|
||||
with gr.Row():
|
||||
w = gr.Number(label="Width", value=-1)
|
||||
h = gr.Number(label="Height", value=-1)
|
||||
img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
|
||||
with gr.Row():
|
||||
out_w = gr.Number(label="Output Width", value=0, interactive=False)
|
||||
out_h = gr.Number(label="Output Height", value=0, interactive=False)
|
||||
img_edge_ratio_infact = gr.Number(
|
||||
label="img_edge_ratio in fact",
|
||||
value=1.0,
|
||||
interactive=False,
|
||||
)
|
||||
btn1 = gr.Button("Generate")
|
||||
out = gr.Video()
|
||||
# pdb.set_trace()
|
||||
i2v_examples_256 = [
|
||||
[
|
||||
"(masterpiece, best quality, highres:1),(1boy, solo:1),(eye blinks:1.8),(head wave:1.3)",
|
||||
"../../data/images/yongen.jpeg",
|
||||
],
|
||||
[
|
||||
"(masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3)",
|
||||
"../../data/images/The-Laughing-Cavalier.jpg",
|
||||
],
|
||||
]
|
||||
with gr.Row():
|
||||
gr.Examples(
|
||||
examples=i2v_examples_256,
|
||||
inputs=[prompt, image],
|
||||
outputs=[out],
|
||||
fn=hf_online_t2v_inference,
|
||||
cache_examples=False,
|
||||
)
|
||||
img_edge_ratio.change(
|
||||
fn=limit_shape,
|
||||
inputs=[image, w, h, img_edge_ratio],
|
||||
outputs=[img_edge_ratio_infact, out_w, out_h],
|
||||
)
|
||||
|
||||
video_length.change(
|
||||
fn=limit_length, inputs=[video_length], outputs=[video_length]
|
||||
)
|
||||
|
||||
btn1.click(
|
||||
fn=hf_online_t2v_inference,
|
||||
inputs=[
|
||||
prompt,
|
||||
image,
|
||||
seed,
|
||||
fps,
|
||||
w,
|
||||
h,
|
||||
video_length,
|
||||
img_edge_ratio_infact,
|
||||
],
|
||||
outputs=out,
|
||||
)
|
||||
|
||||
with gr.Tab("Video to Video"):
|
||||
if ignore_video2video:
|
||||
gr.Markdown(
|
||||
(
|
||||
"Due to GPU limit, MuseVDemo now only support Text2Video. If you want to try Video2Video, please run it locally. \n"
|
||||
"We are trying to support video2video in the future. Thanks for your understanding."
|
||||
)
|
||||
)
|
||||
else:
|
||||
with gr.Row():
|
||||
with gr.Column():
|
||||
prompt = gr.Textbox(label="Prompt")
|
||||
gr.Markdown(
|
||||
(
|
||||
"pose of VisionCondImage should be same as of the first frame of the video. "
|
||||
"its better generate target first frame whose pose is same as of first frame of the video with text2image tool, sch as MJ, SDXL."
|
||||
)
|
||||
)
|
||||
image = gr.Image(label="VisionCondImage")
|
||||
video = gr.Video(label="ReferVideo")
|
||||
# radio = gr.inputs.Radio(, label="Select an option")
|
||||
# ctr_button = gr.inputs.Button(label="Add ControlNet List")
|
||||
# output_text = gr.outputs.Textbox()
|
||||
processor = gr.Textbox(
|
||||
label=f"Control Condition. gradio code now only support dwpose_body_hand, use command can support multi of {control_options}",
|
||||
value="dwpose_body_hand",
|
||||
)
|
||||
gr.Markdown("seed=-1 means that seeds are different in every run")
|
||||
seed = gr.Number(
|
||||
label="Seed (seed=-1 means that the seeds run each time are different)",
|
||||
value=-1,
|
||||
)
|
||||
video_length = gr.Number(label="Video Length", value=12)
|
||||
fps = gr.Number(label="Generate Video FPS", value=6)
|
||||
gr.Markdown(
|
||||
(
|
||||
"If W&H is -1, then use the Reference Image's Size. Size of target video is $(W, H)*img\_edge\_ratio$. \n"
|
||||
"The shorter the image size, the larger the motion amplitude, and the lower video quality.\n"
|
||||
"The longer the W&H, the smaller the motion amplitude, and the higher video quality.\n"
|
||||
"Due to the GPU VRAM limits, the W&H need smaller than 2000px"
|
||||
)
|
||||
)
|
||||
with gr.Row():
|
||||
w = gr.Number(label="Width", value=-1)
|
||||
h = gr.Number(label="Height", value=-1)
|
||||
img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
|
||||
|
||||
with gr.Row():
|
||||
out_w = gr.Number(label="Width", value=0, interactive=False)
|
||||
out_h = gr.Number(label="Height", value=0, interactive=False)
|
||||
img_edge_ratio_infact = gr.Number(
|
||||
label="img_edge_ratio in fact",
|
||||
value=1.0,
|
||||
interactive=False,
|
||||
)
|
||||
btn2 = gr.Button("Generate")
|
||||
out1 = gr.Video()
|
||||
|
||||
v2v_examples_256 = [
|
||||
[
|
||||
"(masterpiece, best quality, highres:1), harley quinn is dancing, animation, by joshua klein",
|
||||
"../../data/demo/cyber_girl.png",
|
||||
"../../data/demo/video1.mp4",
|
||||
],
|
||||
]
|
||||
with gr.Row():
|
||||
gr.Examples(
|
||||
examples=v2v_examples_256,
|
||||
inputs=[prompt, image, video],
|
||||
outputs=[out],
|
||||
fn=hg_online_v2v_inference,
|
||||
cache_examples=False,
|
||||
)
|
||||
img_edge_ratio.change(
|
||||
fn=limit_shape,
|
||||
inputs=[image, w, h, img_edge_ratio],
|
||||
outputs=[img_edge_ratio_infact, out_w, out_h],
|
||||
)
|
||||
video_length.change(
|
||||
fn=limit_length, inputs=[video_length], outputs=[video_length]
|
||||
)
|
||||
btn2.click(
|
||||
fn=hg_online_v2v_inference,
|
||||
inputs=[
|
||||
prompt,
|
||||
image,
|
||||
video,
|
||||
processor,
|
||||
seed,
|
||||
fps,
|
||||
w,
|
||||
h,
|
||||
video_length,
|
||||
img_edge_ratio_infact,
|
||||
],
|
||||
outputs=out1,
|
||||
)
|
||||
|
||||
|
||||
# Set the IP and port
|
||||
ip_address = "0.0.0.0" # Replace with your desired IP address
|
||||
port_number = 7860 # Replace with your desired port number
|
||||
|
||||
|
||||
demo.queue().launch(
|
||||
share=True, debug=True, server_name=ip_address, server_port=port_number
|
||||
)
|
@ -1,223 +0,0 @@
|
||||
import os
|
||||
import time
|
||||
import pdb
|
||||
|
||||
import cuid
|
||||
import gradio as gr
|
||||
|
||||
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
ProjectDir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
|
||||
CheckpointsDir = os.path.join(ProjectDir, "checkpoints")
|
||||
|
||||
|
||||
def download_model():
|
||||
if not os.path.exists(CheckpointsDir):
|
||||
print("Checkpoint Not Downloaded, start downloading...")
|
||||
tic = time.time()
|
||||
snapshot_download(
|
||||
repo_id="TMElyralab/MuseV",
|
||||
local_dir=CheckpointsDir,
|
||||
max_workers=8,
|
||||
)
|
||||
toc = time.time()
|
||||
print(f"download cost {toc-tic} seconds")
|
||||
else:
|
||||
print("Already download the model.")
|
||||
|
||||
|
||||
download_model() # for huggingface deployment.
|
||||
|
||||
from gradio_video2video import online_v2v_inference
|
||||
from gradio_text2video import online_t2v_inference
|
||||
|
||||
|
||||
def update_shape(image):
|
||||
if image != None:
|
||||
h, w, _ = image.shape
|
||||
else:
|
||||
h, w = 768, 512
|
||||
return w, h
|
||||
|
||||
|
||||
class ConcatenateBlock(gr.blocks.Block):
|
||||
def __init__(self, options):
|
||||
self.options = options
|
||||
self.current_string = ""
|
||||
|
||||
def update_string(self, new_choice):
|
||||
if new_choice and new_choice not in self.current_string.split(", "):
|
||||
if self.current_string == "":
|
||||
self.current_string = new_choice
|
||||
else:
|
||||
self.current_string += ", " + new_choice
|
||||
return self.current_string
|
||||
|
||||
|
||||
def process_input(new_choice):
|
||||
return concatenate_block.update_string(new_choice), ""
|
||||
|
||||
|
||||
control_options = [
|
||||
"pose",
|
||||
"pose_body",
|
||||
"pose_hand",
|
||||
"pose_face",
|
||||
"pose_hand_body",
|
||||
"pose_hand_face",
|
||||
"dwpose",
|
||||
"dwpose_face",
|
||||
"dwpose_hand",
|
||||
"dwpose_body",
|
||||
"dwpose_body_hand",
|
||||
"canny",
|
||||
"tile",
|
||||
"hed",
|
||||
"hed_scribble",
|
||||
"depth",
|
||||
"pidi",
|
||||
"normal_bae",
|
||||
"lineart",
|
||||
"lineart_anime",
|
||||
"zoe",
|
||||
"sam",
|
||||
"mobile_sam",
|
||||
"leres",
|
||||
"content",
|
||||
"face_detector",
|
||||
]
|
||||
concatenate_block = ConcatenateBlock(control_options)
|
||||
|
||||
|
||||
css = """#input_img {max-width: 1024px !important} #output_vid {max-width: 1024px; max-height: 576px}"""
|
||||
|
||||
|
||||
with gr.Blocks(css=css) as demo:
|
||||
gr.Markdown(
|
||||
"<div align='center'> <h1> MuseV: Infinite-length and High Fidelity Virtual Human Video Generation with Visual Conditioned Parallel Denoising</span> </h1> \
|
||||
<h2 style='font-weight: 450; font-size: 1rem; margin: 0rem'>\
|
||||
</br>\
|
||||
Zhiqiang Xia <sup>*</sup>,\
|
||||
Zhaokang Chen<sup>*</sup>,\
|
||||
Bin Wu<sup>†</sup>,\
|
||||
Chao Li,\
|
||||
Kwok-Wai Hung,\
|
||||
Chao Zhan,\
|
||||
Yingjie He,\
|
||||
Wenjiang Zhou\
|
||||
(<sup>*</sup>Equal Contribution, <sup>†</sup>Corresponding Author, benbinwu@tencent.com)\
|
||||
</br>\
|
||||
Lyra Lab, Tencent Music Entertainment\
|
||||
</h2> \
|
||||
<a style='font-size:18px;color: #000000' href='https://github.com/TMElyralab/MuseV'>[Github Repo]</a>\
|
||||
<a style='font-size:18px;color: #000000'>, which is important to Open-Source projects. Thanks!</a>\
|
||||
<a style='font-size:18px;color: #000000' href=''> [ArXiv(Coming Soon)] </a>\
|
||||
<a style='font-size:18px;color: #000000' href=''> [Project Page(Coming Soon)] </a> \
|
||||
<a style='font-size:18px;color: #000000'>If MuseV is useful, please help star the repo~ </a> </div>"
|
||||
)
|
||||
with gr.Tab("Text to Video"):
|
||||
with gr.Row():
|
||||
with gr.Column():
|
||||
prompt = gr.Textbox(label="Prompt")
|
||||
image = gr.Image(label="VisionCondImage")
|
||||
gr.Markdown("seed=-1 means that the seeds run each time are different")
|
||||
seed = gr.Number(label="Seed", value=-1)
|
||||
video_length = gr.Number(label="Video Length", value=12)
|
||||
fps = gr.Number(label="Generate Video FPS", value=6)
|
||||
gr.Markdown(
|
||||
(
|
||||
"If W&H is -1, then use the Reference Image's Size. Size of target video is $(W, H)*img\_edge\_ratio$. \n"
|
||||
"The shorter the image size, the larger the motion amplitude, and the lower video quality.\n"
|
||||
"The longer the W&H, the smaller the motion amplitude, and the higher video quality"
|
||||
)
|
||||
)
|
||||
with gr.Row():
|
||||
w = gr.Number(label="Width", value=-1)
|
||||
h = gr.Number(label="Height", value=-1)
|
||||
img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
|
||||
|
||||
btn1 = gr.Button("Generate")
|
||||
out = gr.outputs.Video()
|
||||
# pdb.set_trace()
|
||||
with gr.Row():
|
||||
board = gr.Dataframe(
|
||||
value=[["", "", ""]] * 3,
|
||||
interactive=False,
|
||||
type="array",
|
||||
label="Demo Video",
|
||||
)
|
||||
|
||||
# image.change(fn=update_shape, inputs=[image], outputs=[w, h])
|
||||
|
||||
btn1.click(
|
||||
fn=online_t2v_inference,
|
||||
inputs=[prompt, image, seed, fps, w, h, video_length, img_edge_ratio],
|
||||
outputs=out,
|
||||
)
|
||||
|
||||
with gr.Tab("Video to Video"):
|
||||
with gr.Row():
|
||||
with gr.Column():
|
||||
prompt = gr.Textbox(label="Prompt")
|
||||
gr.Markdown(
|
||||
(
|
||||
"pose of VisionCondImage should be same as of the first frame of the video. "
|
||||
"its better generate target first frame whose pose is same as of first frame of the video with text2image tool, sch as MJ, SDXL."
|
||||
)
|
||||
)
|
||||
image = gr.Image(label="VisionCondImage")
|
||||
video = gr.Video(label="ReferVideo")
|
||||
# radio = gr.inputs.Radio(, label="Select an option")
|
||||
# ctr_button = gr.inputs.Button(label="Add ControlNet List")
|
||||
# output_text = gr.outputs.Textbox()
|
||||
processor = gr.Textbox(
|
||||
label=f"Control Condition. gradio code now only support dwpose_body_hand, use command can support multi of {control_options}",
|
||||
value="dwpose_body_hand",
|
||||
)
|
||||
gr.Markdown("seed=-1 means that seeds are different in every run")
|
||||
seed = gr.Number(label="Seed", value=-1)
|
||||
video_length = gr.Number(label="Video Length", value=12)
|
||||
fps = gr.Number(label="Generate Video FPS", value=6)
|
||||
gr.Markdown(
|
||||
(
|
||||
"If W&H is -1, then use the Reference Image's Size. Size of target video is $(W, H)*img\_edge\_ratio$. \n"
|
||||
"The shorter the image size, the larger the motion amplitude, and the lower video quality. \n"
|
||||
"The longer the W&H, the smaller the motion amplitude, and the higher video quality. "
|
||||
)
|
||||
)
|
||||
with gr.Row():
|
||||
w = gr.Number(label="Width", value=-1)
|
||||
h = gr.Number(label="Height", value=-1)
|
||||
img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
|
||||
|
||||
btn2 = gr.Button("Generate")
|
||||
out1 = gr.outputs.Video()
|
||||
# image.change(fn=update_shape, inputs=[image], outputs=[w, h])
|
||||
|
||||
btn2.click(
|
||||
fn=online_v2v_inference,
|
||||
inputs=[
|
||||
prompt,
|
||||
image,
|
||||
video,
|
||||
processor,
|
||||
seed,
|
||||
fps,
|
||||
w,
|
||||
h,
|
||||
video_length,
|
||||
img_edge_ratio,
|
||||
],
|
||||
outputs=out1,
|
||||
)
|
||||
|
||||
|
||||
# Set the IP and port
|
||||
ip_address = "0.0.0.0" # Replace with your desired IP address
|
||||
port_number = 7860 # Replace with your desired port number
|
||||
|
||||
|
||||
demo.queue().launch(
|
||||
share=False, debug=True, server_name=ip_address, server_port=port_number
|
||||
)
|
@ -45,10 +45,8 @@ from musev.models.unet_loader import load_unet_by_name
|
||||
from musev.utils.util import save_videos_grid_with_opencv
|
||||
from musev import logger
|
||||
|
||||
need_load_predictor = False
|
||||
if need_load_predictor:
|
||||
video_sd_predictor = None
|
||||
else:
|
||||
use_v2v_predictor = False
|
||||
if use_v2v_predictor:
|
||||
from gradio_video2video import sd_predictor as video_sd_predictor
|
||||
|
||||
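This hunk renames the old `need_load_predictor` switch to `use_v2v_predictor`: when the flag is set, text2video reuses the `sd_predictor` already built by `gradio_video2video` instead of loading a second copy of the models. A minimal sketch of the sharing pattern (not part of this diff; the `else` branch is a placeholder for the loading code further down in this file):

```python
use_v2v_predictor = False  # True only when the video2video predictor is available

if use_v2v_predictor:
    # Share the predictor that gradio_video2video already initialised,
    # avoiding a duplicate set of model weights in GPU memory.
    from gradio_video2video import sd_predictor as video_sd_predictor
    sd_predictor = video_sd_predictor
else:
    sd_predictor = None  # placeholder: the real code constructs its own predictor below
```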
logger.setLevel("INFO")
|
||||
@ -464,7 +462,7 @@ def read_image_and_name(path):
|
||||
return images, name
|
||||
|
||||
|
||||
if referencenet_model_name is not None and need_load_predictor:
|
||||
if referencenet_model_name is not None and not use_v2v_predictor:
|
||||
referencenet = load_referencenet_by_name(
|
||||
model_name=referencenet_model_name,
|
||||
# sd_model=sd_model_path,
|
||||
@ -476,7 +474,7 @@ else:
|
||||
referencenet = None
|
||||
referencenet_model_name = "no"
|
||||
|
||||
if vision_clip_extractor_class_name is not None and need_load_predictor:
|
||||
if vision_clip_extractor_class_name is not None and not use_v2v_predictor:
|
||||
vision_clip_extractor = load_vision_clip_encoder_by_name(
|
||||
ip_image_encoder=vision_clip_model_path,
|
||||
vision_clip_extractor_class_name=vision_clip_extractor_class_name,
|
||||
@ -488,7 +486,7 @@ else:
|
||||
vision_clip_extractor = None
|
||||
logger.info(f"vision_clip_extractor, None")
|
||||
|
||||
if ip_adapter_model_name is not None and need_load_predictor:
|
||||
if ip_adapter_model_name is not None and not use_v2v_predictor:
|
||||
ip_adapter_image_proj = load_ip_adapter_image_proj_by_name(
|
||||
model_name=ip_adapter_model_name,
|
||||
ip_image_encoder=ip_adapter_model_params_dict.get(
|
||||
@ -526,11 +524,11 @@ for model_name, sd_model_params in sd_model_params_dict.items():
|
||||
strict=not (facein_model_name is not None),
|
||||
need_t2i_ip_adapter_face=ip_adapter_face_model_name is not None,
|
||||
)
|
||||
if need_load_predictor
|
||||
if not use_v2v_predictor
|
||||
else None
|
||||
)
|
||||
|
||||
if facein_model_name is not None and need_load_predictor:
|
||||
if facein_model_name is not None and not use_v2v_predictor:
|
||||
(
|
||||
face_emb_extractor,
|
||||
facein_image_proj,
|
||||
@ -552,7 +550,7 @@ for model_name, sd_model_params in sd_model_params_dict.items():
|
||||
face_emb_extractor = None
|
||||
facein_image_proj = None
|
||||
|
||||
if ip_adapter_face_model_name is not None and need_load_predictor:
|
||||
if ip_adapter_face_model_name is not None and not use_v2v_predictor:
|
||||
(
|
||||
ip_adapter_face_emb_extractor,
|
||||
ip_adapter_face_image_proj,
|
||||
@ -595,10 +593,10 @@ for model_name, sd_model_params in sd_model_params_dict.items():
|
||||
ip_adapter_face_emb_extractor=ip_adapter_face_emb_extractor,
|
||||
ip_adapter_face_image_proj=ip_adapter_face_image_proj,
|
||||
)
|
||||
if need_load_predictor
|
||||
if not use_v2v_predictor
|
||||
else video_sd_predictor
|
||||
)
|
||||
if not need_load_predictor:
|
||||
if use_v2v_predictor:
|
||||
print(
|
||||
"text2video use video_sd_predictor, sd_predictor type is ",
|
||||
type(sd_predictor),
|
||||
|