add model init in Multimodal Live Streaming code (#733)
README.md (18 lines changed)

@@ -2197,8 +2197,8 @@ from moviepy.editor import VideoFileClip
 import tempfile
 import librosa
 import soundfile as sf
-
-## make sure The model has been initialized and `model.init_tts()` has been executed
+import torch
+from transformers import AutoModel, AutoTokenizer
 
 def get_video_chunk_content(video_path, flatten=True):
     video = VideoFileClip(video_path)
@@ -2223,7 +2223,19 @@ def get_video_chunk_content(video_path, flatten=True):
 
     return contents
 
-video_path="/path/to/video"
+
+model = AutoModel.from_pretrained('openbmb/MiniCPM-o-2_6', trust_remote_code=True,
+    attn_implementation='sdpa', torch_dtype=torch.bfloat16)
+model = model.eval().cuda()
+tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-o-2_6', trust_remote_code=True)
+
+model.init_tts()
+
+# If you are using an older version of PyTorch, you might encounter this issue "weight_norm_fwd_first_dim_kernel" not implemented for 'BFloat16', Please convert the TTS to float32 type.
+# model.tts.float()
+
+# https://huggingface.co/openbmb/MiniCPM-o-2_6/blob/main/assets/Skiing.mp4
+video_path="assets/Skiing.mp4"
 sys_msg = model.get_sys_prompt(mode='omni', language='en')
 # if use voice clone prompt, please set ref_audio
 # ref_audio_path = '/path/to/ref_audio'
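For context, the initialized `model`, `tokenizer`, and the chunked video returned by `get_video_chunk_content` are then fed to a single omni chat turn. The following is a hedged sketch of that continuation, not part of this commit; the keyword arguments (`omni_input`, `use_tts_template`, `generate_audio`, `output_audio_path`) and sampling settings are assumptions taken from the surrounding MiniCPM-o-2.6 README.

# Hedged sketch (not part of this diff): one omni chat turn over the per-second
# video/audio chunks, using the model, tokenizer, sys_msg and video_path set up above.
contents = get_video_chunk_content(video_path)             # ["<unit>", frame_image, 1s_audio, ...]
msgs = [sys_msg, {"role": "user", "content": contents}]

res = model.chat(
    msgs=msgs,
    tokenizer=tokenizer,
    sampling=True,
    temperature=0.5,
    max_new_tokens=4096,
    omni_input=True,                 # mixed image + audio content (assumed flag)
    use_tts_template=True,
    generate_audio=True,             # uses the TTS head enabled by model.init_tts()
    output_audio_path='output.wav',  # where the synthesized speech is written
)
print(res)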
README_zh.md (18 lines changed)

@@ -2179,8 +2179,8 @@ from moviepy.editor import VideoFileClip
 import tempfile
 import librosa
 import soundfile as sf
-
-## make sure The model has been initialized and `model.init_tts()` has been executed
+import torch
+from transformers import AutoModel, AutoTokenizer
 
 def get_video_chunk_content(video_path, flatten=True):
     video = VideoFileClip(video_path)
@@ -2205,7 +2205,19 @@ def get_video_chunk_content(video_path, flatten=True):
 
     return contents
 
-video_path="/path/to/video"
+
+model = AutoModel.from_pretrained('openbmb/MiniCPM-o-2_6', trust_remote_code=True,
+    attn_implementation='sdpa', torch_dtype=torch.bfloat16)
+model = model.eval().cuda()
+tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-o-2_6', trust_remote_code=True)
+
+model.init_tts()
+
+# If you are using an older version of PyTorch, you might encounter this issue "weight_norm_fwd_first_dim_kernel" not implemented for 'BFloat16', Please convert the TTS to float32 type.
+# model.tts.float()
+
+# https://huggingface.co/openbmb/MiniCPM-o-2_6/blob/main/assets/Skiing.mp4
+video_path="assets/Skiing.mp4"
 sys_msg = model.get_sys_prompt(mode='omni', language='en')
 # if use voice clone prompt, please set ref_audio
 # ref_audio_path = '/path/to/ref_audio'
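The trailing context lines in both files mention setting `ref_audio` to use a voice-clone prompt. Below is a hedged sketch of what that setup presumably looks like, following the 16 kHz mono loading pattern used elsewhere in the README; the path is a placeholder and the `ref_audio` keyword of `get_sys_prompt` is an assumption, not shown in this diff.

# Hedged sketch (not part of this diff): build a voice-clone system prompt from a
# reference recording so the generated speech imitates that voice.
ref_audio_path = '/path/to/ref_audio'                            # placeholder path
ref_audio, _ = librosa.load(ref_audio_path, sr=16000, mono=True)
sys_msg = model.get_sys_prompt(ref_audio=ref_audio, mode='omni', language='en')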