import os
import time

import gradio as gr
from huggingface_hub import snapshot_download

ProjectDir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
CheckpointsDir = os.path.join(ProjectDir, "checkpoints")


def download_model():
    """Download the MuseV checkpoints from the Hugging Face Hub if absent."""
    if not os.path.exists(CheckpointsDir):
        print("Checkpoints not found, start downloading...")
        tic = time.time()
        snapshot_download(
            repo_id="TMElyralab/MuseV",
            local_dir=CheckpointsDir,
            max_workers=8,
        )
        toc = time.time()
        print(f"download cost {toc - tic} seconds")
    else:
        print("Model already downloaded.")


download_model()  # for huggingface deployment.

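# A minimal sketch of a narrower download, assuming only part of the repo is
# needed: snapshot_download also accepts allow_patterns. The pattern below is
# purely illustrative, not a confirmed MuseV repo layout:
#
#     snapshot_download(
#         repo_id="TMElyralab/MuseV",
#         local_dir=CheckpointsDir,
#         allow_patterns=["t2i/*"],
#     )
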
# Imported only after download_model(), presumably so any checkpoints these
# modules expect are already in place when they initialize.
from gradio_video2video import online_v2v_inference
from gradio_text2video import online_t2v_inference


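# Note (assuming gr.Image's default numpy output): the callback receives the
# image as an array in (height, width, channels) order, which is why
# update_shape below unpacks h before w.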
def update_shape(image):
    """Return (width, height) of the uploaded image, or the 512x768 default."""
    if image is not None:
        h, w, _ = image.shape
    else:
        h, w = 768, 512
    return w, h


class ConcatenateBlock(gr.blocks.Block):
    """Accumulates selected options into a comma-separated string."""

    def __init__(self, options):
        super().__init__()
        self.options = options
        self.current_string = ""

    def update_string(self, new_choice):
        if new_choice and new_choice not in self.current_string.split(", "):
            if self.current_string == "":
                self.current_string = new_choice
            else:
                self.current_string += ", " + new_choice
        return self.current_string


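# Example of the accumulation behavior (duplicates are ignored):
#
#     block = ConcatenateBlock(["pose", "canny"])
#     block.update_string("pose")   # -> "pose"
#     block.update_string("canny")  # -> "pose, canny"
#     block.update_string("pose")   # -> "pose, canny"
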
def process_input(new_choice):
    return concatenate_block.update_string(new_choice), ""


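# A sketch of how process_input could be wired to the control_options list
# defined below (not used by the demo; component names are illustrative):
#
#     choice = gr.Dropdown(choices=control_options, label="Add control")
#     combined = gr.Textbox(label="Selected controls")
#     choice.change(fn=process_input, inputs=choice, outputs=[combined, choice])
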
control_options = [
    "pose",
    "pose_body",
    "pose_hand",
    "pose_face",
    "pose_hand_body",
    "pose_hand_face",
    "dwpose",
    "dwpose_face",
    "dwpose_hand",
    "dwpose_body",
    "dwpose_body_hand",
    "canny",
    "tile",
    "hed",
    "hed_scribble",
    "depth",
    "pidi",
    "normal_bae",
    "lineart",
    "lineart_anime",
    "zoe",
    "sam",
    "mobile_sam",
    "leres",
    "content",
    "face_detector",
]
concatenate_block = ConcatenateBlock(control_options)

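# The names in control_options are ControlNet-style condition extractors: the
# pose/dwpose entries are human keypoint detectors, while canny, hed, depth,
# lineart and the rest are classic image-condition preprocessors.
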
css = """#input_img {max-width: 1024px !important} #output_vid {max-width: 1024px; max-height: 576px}"""

with gr.Blocks(css=css) as demo:
    gr.Markdown(
        "<div align='center'> <h1> MuseV: Infinite-length and High Fidelity Virtual Human Video Generation with Visual Conditioned Parallel Denoising</h1> \
        <h2 style='font-weight: 450; font-size: 1rem; margin: 0rem'>\
        </br>\
        Zhiqiang Xia <sup>*</sup>,\
        Zhaokang Chen<sup>*</sup>,\
        Bin Wu<sup>†</sup>,\
        Chao Li,\
        Kwok-Wai Hung,\
        Chao Zhan,\
        Yingjie He,\
        Wenjiang Zhou\
        (<sup>*</sup>Equal Contribution, <sup>†</sup>Corresponding Author, benbinwu@tencent.com)\
        </br>\
        Lyra Lab, Tencent Music Entertainment\
        </h2> \
        <a style='font-size:18px;color: #000000' href='https://github.com/TMElyralab/MuseV'>[Github Repo]</a>\
        <a style='font-size:18px;color: #000000' href=''> [ArXiv(Coming Soon)] </a>\
        <a style='font-size:18px;color: #000000' href=''> [Project Page(Coming Soon)] </a> \
        <a style='font-size:18px;color: #000000'>If MuseV is useful, please help star the repo, which is important to open-source projects. Thanks!</a> </div>"
    )
    with gr.Tab("Text to Video"):
        with gr.Row():
            with gr.Column():
                prompt = gr.Textbox(label="Prompt")
                image = gr.Image(label="VisionCondImage")
                gr.Markdown("seed=-1 means a different random seed is used on each run")
                seed = gr.Number(label="Seed", value=-1)
                video_length = gr.Number(label="Video Length", value=12)
                fps = gr.Number(label="Generate Video FPS", value=6)
                gr.Markdown(
                    (
                        "If W&H is -1, the reference image's size is used. The target video size is $(W, H)*img\_edge\_ratio$. \n"
                        "The smaller the image size, the larger the motion amplitude and the lower the video quality.\n"
                        "The larger the W&H, the smaller the motion amplitude and the higher the video quality."
                    )
                )
                with gr.Row():
                    w = gr.Number(label="Width", value=-1)
                    h = gr.Number(label="Height", value=-1)
                    img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
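                # Worked example of the sizing rule above (assuming it applies
                # as stated): with W=-1, H=-1, a 512x768 reference image, and
                # img_edge_ratio=0.5, the target video is rendered at 256x384.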
            btn1 = gr.Button("Generate")
            out = gr.Video()
        with gr.Row():
            board = gr.Dataframe(
                value=[["", "", ""]] * 3,
                interactive=False,
                type="array",
                label="Demo Video",
            )

        # image.change(fn=update_shape, inputs=[image], outputs=[w, h])

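        # gr.Button.click passes the listed components' values to fn
        # positionally, so the order of `inputs` must match the parameter
        # order of online_t2v_inference.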
        btn1.click(
            fn=online_t2v_inference,
            inputs=[prompt, image, seed, fps, w, h, video_length, img_edge_ratio],
            outputs=out,
        )

    with gr.Tab("Video to Video"):
        with gr.Row():
            with gr.Column():
                prompt = gr.Textbox(label="Prompt")
                gr.Markdown(
                    (
                        "The pose of the VisionCondImage should be the same as the pose of the first frame of the video. "
                        "It is better to generate a target first frame whose pose matches the video's first frame with a text2image tool, such as MJ or SDXL."
                    )
                )
                image = gr.Image(label="VisionCondImage")
                video = gr.Video(label="ReferVideo")
                # radio = gr.inputs.Radio(, label="Select an option")
                # ctr_button = gr.inputs.Button(label="Add ControlNet List")
                # output_text = gr.outputs.Textbox()
                processor = gr.Textbox(
                    label=f"Control Condition. The gradio demo currently supports only dwpose_body_hand; the command line supports any combination of {control_options}",
                    value="dwpose_body_hand",
                )
                gr.Markdown("seed=-1 means a different random seed is used on each run")
                seed = gr.Number(label="Seed", value=-1)
                video_length = gr.Number(label="Video Length", value=12)
                fps = gr.Number(label="Generate Video FPS", value=6)
                gr.Markdown(
                    (
                        "If W&H is -1, the reference image's size is used. The target video size is $(W, H)*img\_edge\_ratio$. \n"
                        "The smaller the image size, the larger the motion amplitude and the lower the video quality. \n"
                        "The larger the W&H, the smaller the motion amplitude and the higher the video quality. "
                    )
                )
                with gr.Row():
                    w = gr.Number(label="Width", value=-1)
                    h = gr.Number(label="Height", value=-1)
                    img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
            btn2 = gr.Button("Generate")
            out1 = gr.Video()
        # image.change(fn=update_shape, inputs=[image], outputs=[w, h])

        btn2.click(
            fn=online_v2v_inference,
            inputs=[
                prompt,
                image,
                video,
                processor,
                seed,
                fps,
                w,
                h,
                video_length,
                img_edge_ratio,
            ],
            outputs=out1,
        )


# Set the IP and port
ip_address = "0.0.0.0"  # Replace with your desired IP address
port_number = 7860  # Replace with your desired port number

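# demo.queue() enables request queuing so long-running generations are not cut
# off by HTTP timeouts; server_name="0.0.0.0" binds all interfaces, which
# hosted deployments such as Hugging Face Spaces expect.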
demo.queue().launch(
    share=False, debug=True, server_name=ip_address, server_port=port_number
)