From 259d54ed0ade2428c8fc31fba97f96288bd23a81 Mon Sep 17 00:00:00 2001
From: JamePeng
Date: Sun, 18 Aug 2024 23:53:07 +0800
Subject: [PATCH 1/6] Update web_demo_streamlit-2_5.py

Fixed a bug where `imagefile` was not initialized when no image was passed in.
---
 web_demo_streamlit-2_5.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/web_demo_streamlit-2_5.py b/web_demo_streamlit-2_5.py
index bf100ce..4cee58c 100644
--- a/web_demo_streamlit-2_5.py
+++ b/web_demo_streamlit-2_5.py
@@ -88,6 +88,7 @@ if user_text:
     # Generate reply using the model
     model = st.session_state.model
     tokenizer = st.session_state.tokenizer
+    imagefile = None
 
     with st.chat_message(A_NAME, avatar="assistant"):
         # If the previous message contains an image, pass the image to the model

From c9f5cd4b0063f0581299d3f1afcefc9c70ff59ff Mon Sep 17 00:00:00 2001
From: JamePeng
Date: Mon, 19 Aug 2024 00:45:33 +0800
Subject: [PATCH 2/6] Update streamlit implementation for MiniCPM-V 2.6

Compared with the 2.5 Streamlit implementation, this version makes better use
of the new multimodal capabilities of 2.6:

1. Input modes: the app supports uploading and processing text, a single
   image, multiple images, and video, and handles each input type according
   to the mode selected by the user.

2. Video frame extraction and encoding: in video mode, frames are extracted
   from the uploaded video with the decord library and uniformly sampled so
   that the model can process them and generate a response. Variables and
   comments are more detailed and explicit, making the code easier to learn
   from and reuse.

3. File upload and processing: users can upload images and videos, which are
   handled according to the current mode: a single image is displayed in
   single-image mode, several images in multi-image mode, and sampled frames
   are processed in video mode. You can switch back and forth between media
   types.

4. Tip: you can run `streamlit run ./web_demo_streamlit-minicpmv2_6.py
   --server.maxUploadSize 1024` to raise the maximum upload size to 1024 MB
   or more. The default 200 MB limit of Streamlit's file_uploader component
   might be insufficient for video-based interaction; adjust the size based
   on your GPU memory usage.
---
 web_demo_streamlit-minicpmv2_6.py | 263 ++++++++++++++++++++++++++++++
 1 file changed, 263 insertions(+)
 create mode 100644 web_demo_streamlit-minicpmv2_6.py

diff --git a/web_demo_streamlit-minicpmv2_6.py b/web_demo_streamlit-minicpmv2_6.py
new file mode 100644
index 0000000..882450e
--- /dev/null
+++ b/web_demo_streamlit-minicpmv2_6.py
@@ -0,0 +1,263 @@
+import os.path
+
+import streamlit as st
+import torch
+from PIL import Image
+from decord import VideoReader, cpu
+from transformers import AutoModel, AutoTokenizer
+
+# Model path
+model_path = "openbmb/MiniCPM-V-2_6"
+upload_path = "./uploads"  # this directory must exist before a video is uploaded
+
+# User and assistant names
+U_NAME = "User"
+A_NAME = "Assistant"
+
+# Set page configuration
+st.set_page_config(
+    page_title="MiniCPM-V-2_6 Streamlit",
+    page_icon=":robot:",
+    layout="wide"
+)
+
+
+# Load model and tokenizer
+@st.cache_resource
+def load_model_and_tokenizer():
+    print(f"load_model_and_tokenizer from {model_path}")
+    model = (AutoModel.from_pretrained(model_path, trust_remote_code=True, attn_implementation='sdpa').
+             to(dtype=torch.bfloat16))
+    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+    return model, tokenizer
+
+
+# Initialize model and tokenizer in session state
+if 'model' not in st.session_state:
+    st.session_state.model, st.session_state.tokenizer = load_model_and_tokenizer()
+    st.session_state.model.eval().cuda()
+    print("Model and tokenizer loaded successfully!")
+
+# Initialize chat session state
+if 'chat_history' not in st.session_state:
+    st.session_state.chat_history = []
+    st.session_state.uploaded_image_list = []
+    st.session_state.uploaded_image_num = 0
+    st.session_state.uploaded_video_list = []
+    st.session_state.uploaded_video_num = 0
+    st.session_state.response = ""
+
+# Sidebar settings
+sidebar_name = st.sidebar.title("MiniCPM-V-2_6 Streamlit")
+max_length = st.sidebar.slider("max_length", 0, 4096, 2048, step=2)
+repetition_penalty = st.sidebar.slider("repetition_penalty", 0.0, 2.0, 1.05, step=0.01)
+top_k = st.sidebar.slider("top_k", 0, 100, 100, step=1)
+top_p = st.sidebar.slider("top_p", 0.0, 1.0, 0.8, step=0.01)
+temperature = st.sidebar.slider("temperature", 0.0, 1.0, 0.7, step=0.01)
+
+# Button to clear session history
+buttonClean = st.sidebar.button("Clear session history", key="clean")
+if buttonClean:
+    # Reset the session state history and uploaded file lists
+    st.session_state.chat_history = []
+    st.session_state.uploaded_image_list = []
+    st.session_state.uploaded_image_num = 0
+    st.session_state.uploaded_video_list = []
+    st.session_state.uploaded_video_num = 0
+    st.session_state.response = ""
+
+    # If using GPU, clear the CUDA cache to free up memory
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+
+    # Rerun to refresh the interface
+    st.rerun()
+
+# Display chat history
+for i, message in enumerate(st.session_state.chat_history):
+    if message["role"] == "user":
+        with st.chat_message(name="user", avatar="user"):
+            if message["image"] is not None:
+                st.image(message["image"], caption='User uploaded images', width=512, use_column_width=False)
+                continue
+            elif message["video"] is not None:
+                st.video(message["video"], format="video/mp4", loop=False, autoplay=False, muted=True)
+                continue
+            elif message["content"] is not None:
+                st.markdown(message["content"])
+    else:
+        with st.chat_message(name="model", avatar="assistant"):
+            st.markdown(message["content"])
+
+# Select mode
+selected_mode = st.sidebar.selectbox("Select Mode", ["Text", "Single Image", "Multiple Images", "Video"])
+
+# Supported image file extensions
+image_type = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp']
+
+if selected_mode == "Single Image":
+    # Single Image Mode
+    uploaded_image = st.sidebar.file_uploader("Upload a Single Image", key=1, type=image_type,
+                                              accept_multiple_files=False)
+    if uploaded_image is not None:
+        st.image(uploaded_image, caption='User Uploaded Image', width=512, use_column_width=False)
+        # Add the uploaded image to the chat history
+        st.session_state.chat_history.append({"role": "user", "content": None, "image": uploaded_image, "video": None})
+        st.session_state.uploaded_image_list = [uploaded_image]
+        st.session_state.uploaded_image_num = 1
+
+if selected_mode == "Multiple Images":
+    # Multiple Images Mode
+    uploaded_image_list = st.sidebar.file_uploader("Upload Multiple Images", key=2, type=image_type,
+                                                   accept_multiple_files=True)
+    uploaded_image_num = len(uploaded_image_list)
+
+    if uploaded_image_list is not None and uploaded_image_num > 0:
+        for img in uploaded_image_list:
+            st.image(img, caption='User Uploaded Image', width=512,
+                     use_column_width=False)
+            # Add the uploaded images to the chat history
+            st.session_state.chat_history.append({"role": "user", "content": None, "image": img, "video": None})
+        # Update the uploaded image list and count in st.session_state
+        st.session_state.uploaded_image_list = uploaded_image_list
+        st.session_state.uploaded_image_num = uploaded_image_num
+
+# Supported video format suffixes
+video_type = ['.mp4', '.mkv', '.mov', '.avi', '.flv', '.wmv', '.webm', '.m4v']
+
+# Tip: You can use the command `streamlit run ./web_demo_streamlit-minicpmv2_6.py --server.maxUploadSize 1024`
+# to adjust the maximum upload size to 1024MB or larger files.
+# The default 200MB limit of Streamlit's file_uploader component might be insufficient for video-based interactions.
+# Adjust the size based on your GPU memory usage.
+
+if selected_mode == "Video":
+    # Single video mode
+    uploaded_video = st.sidebar.file_uploader("Upload a single video file", key=3, type=video_type,
+                                              accept_multiple_files=False)
+    if uploaded_video is not None:
+        st.video(uploaded_video, format="video/mp4", loop=False, autoplay=False, muted=True)
+        st.session_state.chat_history.append({"role": "user", "content": None, "image": None, "video": uploaded_video})
+
+        uploaded_video_path = os.path.join(upload_path, uploaded_video.name)
+        with open(uploaded_video_path, "wb") as vf:
+            vf.write(uploaded_video.getvalue())
+        st.session_state.uploaded_video_list = [uploaded_video_path]
+        st.session_state.uploaded_video_num = 1
+
+MAX_NUM_FRAMES = 64  # if cuda OOM set a smaller number
+
+
+# Encodes a video by sampling frames at a fixed rate and converting them to image arrays.
+def encode_video(video_path):
+    def uniform_sample(frame_indices, num_samples):
+        # Calculate sampling interval and uniformly sample frame indices
+        gap = len(frame_indices) / num_samples
+        sampled_idxs = [int(i * gap + gap / 2) for i in range(num_samples)]
+        return [frame_indices[i] for i in sampled_idxs]
+
+    # Read the video and set the decoder's context to CPU
+    vr = VideoReader(video_path, ctx=cpu(0))
+
+    # Calculate the sampling interval to sample video frames at 1 FPS
+    sample_fps = round(vr.get_avg_fps() / 1)  # use an integer frame stride for ~1 FPS
+    frame_idx = list(range(0, len(vr), sample_fps))
+
+    # If the number of sampled frames exceeds the maximum limit, uniformly sample them
+    if len(frame_idx) > MAX_NUM_FRAMES:
+        frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
+
+    # Retrieve the sampled frames and convert them to image arrays
+    frames = vr.get_batch(frame_idx).asnumpy()
+    frames = [Image.fromarray(frame.astype('uint8')) for frame in frames]
+
+    print('Number of frames:', len(frames))
+    return frames
+
+
+# User input box
+user_text = st.chat_input("Enter your question")
+if user_text:
+    # Display user input and save it to session history
+    with st.chat_message(U_NAME, avatar="user"):
+        st.session_state.chat_history.append({
+            "role": "user",
+            "content": user_text,
+            "image": None,
+            "video": None
+        })
+        st.markdown(f"{U_NAME}: {user_text}")
+
+    # Generate responses using the model
+    model = st.session_state.model
+    tokenizer = st.session_state.tokenizer
+    content_list = []  # Store the content (text or image) that will be passed into the model
+    imageFile = None
+
+    with st.chat_message(A_NAME, avatar="assistant"):
+        # Handle different inputs depending on the mode selected by the user
+        if selected_mode == "Single Image":
+            # Single image mode: pass in the last uploaded image
+            print("Single Image mode in use")
+            if len(st.session_state.chat_history) > 1 and len(st.session_state.uploaded_image_list) >= 1:
+                uploaded_image = st.session_state.uploaded_image_list[-1]
+                if uploaded_image:
+                    imageFile = Image.open(uploaded_image).convert('RGB')
+                    content_list.append(imageFile)
+            else:
+                print("Single Image mode: No image found")
+
+        elif selected_mode == "Multiple Images":
+            # Multi-image mode: pass in all the images uploaded last time
+            print("Multiple Images mode in use")
+            if len(st.session_state.chat_history) > 1 and st.session_state.uploaded_image_num >= 1:
+                for uploaded_image in st.session_state.uploaded_image_list:
+                    imageFile = Image.open(uploaded_image).convert('RGB')
+                    content_list.append(imageFile)
+            else:
+                print("Multiple Images mode: No image found")
+
+        elif selected_mode == "Video":
+            # Video mode: pass in slice frames of uploaded video
+            print("Video mode in use")
+            if len(st.session_state.chat_history) > 1 and st.session_state.uploaded_video_num == 1:
+                uploaded_video_path = st.session_state.uploaded_video_list[-1]
+                if uploaded_video_path:
+                    frames = encode_video(uploaded_video_path)
+                else:
+                    print("Video Mode: No video found")
+
+        # Defining model parameters
+        params = {
+            'sampling': True,
+            'top_p': top_p,
+            'top_k': top_k,
+            'temperature': temperature,
+            'repetition_penalty': repetition_penalty,
+            "max_new_tokens": max_length,
+            "stream": True
+        }
+
+        # Set different input parameters depending on whether to upload a video
+        if st.session_state.uploaded_video_num == 1 and selected_mode == "Video":
+            msgs = [{"role": "user", "content": frames + [user_text]}]
+            # Set decode params for video
+            params["max_inp_length"] = 4352  # Set the maximum input length of the video mode
+            params["use_image_id"] = False  # Do not use image_id
+            params["max_slice_nums"] = 1  # use 1 if cuda OOM and video resolution > 448*448
+        else:
+            content_list.append(user_text)
+            msgs = [{"role": "user", "content": content_list}]
+
+        print("content_list:", content_list)  # debug
+        print("params:", params)  # debug
+
+        # Generate and display the model's responses
+        response = model.chat(image=None, msgs=msgs, context=None, tokenizer=tokenizer, **params)
+        st.session_state.response = st.write_stream(response)
+        st.session_state.chat_history.append({
+            "role": "model",
+            "content": st.session_state.response,
+            "image": None,
+            "video": None
+        })
+
+    st.divider()  # Add separators to the interface

From dc5f80925367515582260335e9bcecc10ba08e0a Mon Sep 17 00:00:00 2001
From: JamePeng
Date: Tue, 27 Aug 2024 04:03:35 +0800
Subject: [PATCH 3/6] Update web_demo_streamlit-minicpmv2_6.py

1. Avoid using the 'None' string when `user_text` is empty.
2. Added `st.spinner` to display a loading message during AI content
   generation.
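The guard distinguishes "nothing submitted yet" (`st.chat_input` returns None
until the user submits) from a submitted-but-empty message; a minimal sketch
of the intended pattern:

    user_text = st.chat_input("Enter your question")
    if user_text is not None:
        if user_text.strip() == "":
            st.warning('Input message cannot be empty!', icon="⚠️")
        else:
            ...  # generate a reply with the model, as before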
---
 web_demo_streamlit-minicpmv2_6.py | 164 ++++++++++++++++--------------
 1 file changed, 85 insertions(+), 79 deletions(-)

diff --git a/web_demo_streamlit-minicpmv2_6.py b/web_demo_streamlit-minicpmv2_6.py
index 882450e..0fa5858 100644
--- a/web_demo_streamlit-minicpmv2_6.py
+++ b/web_demo_streamlit-minicpmv2_6.py
@@ -173,91 +173,97 @@ def encode_video(video_path):
     return frames
 
 
+
 # User input box
 user_text = st.chat_input("Enter your question")
-if user_text:
-    # Display user input and save it to session history
-    with st.chat_message(U_NAME, avatar="user"):
-        st.session_state.chat_history.append({
-            "role": "user",
-            "content": user_text,
-            "image": None,
-            "video": None
-        })
-        st.markdown(f"{U_NAME}: {user_text}")
+if user_text is not None:
+    if user_text.strip() == "":
+        st.warning('Input message cannot be empty!', icon="⚠️")
+    else:
+        # Display user input and save it to session history
+        with st.chat_message(U_NAME, avatar="user"):
+            st.session_state.chat_history.append({
+                "role": "user",
+                "content": user_text,
+                "image": None,
+                "video": None
+            })
+            st.markdown(f"{U_NAME}: {user_text}")
 
-    # Generate responses using the model
-    model = st.session_state.model
-    tokenizer = st.session_state.tokenizer
-    content_list = []  # Store the content (text or image) that will be passed into the model
-    imageFile = None
+        # Generate responses using the model
+        model = st.session_state.model
+        tokenizer = st.session_state.tokenizer
+        content_list = []  # Store the content (text or image) that will be passed into the model
+        imageFile = None
 
-    with st.chat_message(A_NAME, avatar="assistant"):
-        # Handle different inputs depending on the mode selected by the user
-        if selected_mode == "Single Image":
-            # Single image mode: pass in the last uploaded image
-            print("Single Image mode in use")
-            if len(st.session_state.chat_history) > 1 and len(st.session_state.uploaded_image_list) >= 1:
-                uploaded_image = st.session_state.uploaded_image_list[-1]
-                if uploaded_image:
-                    imageFile = Image.open(uploaded_image).convert('RGB')
-                    content_list.append(imageFile)
+        with st.chat_message(A_NAME, avatar="assistant"):
+            # Handle different inputs depending on the mode selected by the user
+            if selected_mode == "Single Image":
+                # Single image mode: pass in the last uploaded image
+                print("Single Image mode in use")
+                if len(st.session_state.chat_history) > 1 and len(st.session_state.uploaded_image_list) >= 1:
+                    uploaded_image = st.session_state.uploaded_image_list[-1]
+                    if uploaded_image:
+                        imageFile = Image.open(uploaded_image).convert('RGB')
+                        content_list.append(imageFile)
+                else:
+                    print("Single Image mode: No image found")
+
+            elif selected_mode == "Multiple Images":
+                # Multi-image mode: pass in all the images uploaded last time
+                print("Multiple Images mode in use")
+                if len(st.session_state.chat_history) > 1 and st.session_state.uploaded_image_num >= 1:
+                    for uploaded_image in st.session_state.uploaded_image_list:
+                        imageFile = Image.open(uploaded_image).convert('RGB')
+                        content_list.append(imageFile)
+                else:
+                    print("Multiple Images mode: No image found")
+
+            elif selected_mode == "Video":
+                # Video mode: pass in slice frames of uploaded video
+                print("Video mode in use")
+                if len(st.session_state.chat_history) > 1 and st.session_state.uploaded_video_num == 1:
+                    uploaded_video_path = st.session_state.uploaded_video_list[-1]
+                    if uploaded_video_path:
+                        frames = encode_video(uploaded_video_path)
+                    else:
+                        print("Video Mode: No video found")
+
+            # Defining model parameters
+            params = {
+                'sampling': True,
+                'top_p': top_p,
+                'top_k': top_k,
+                'temperature': temperature,
+                'repetition_penalty': repetition_penalty,
+                "max_new_tokens": max_length,
+                "stream": True
+            }
+
+            # Set different input parameters depending on whether to upload a video
+            if st.session_state.uploaded_video_num == 1 and selected_mode == "Video":
+                msgs = [{"role": "user", "content": frames + [user_text]}]
+                # Set decode params for video
+                params["max_inp_length"] = 4352  # Set the maximum input length of the video mode
+                params["use_image_id"] = False  # Do not use image_id
+                params["max_slice_nums"] = 1  # use 1 if cuda OOM and video resolution > 448*448
             else:
-                print("Single Image mode: No image found")
+                content_list.append(user_text)
+                msgs = [{"role": "user", "content": content_list}]
 
-        elif selected_mode == "Multiple Images":
-            # Multi-image mode: pass in all the images uploaded last time
-            print("Multiple Images mode in use")
-            if len(st.session_state.chat_history) > 1 and st.session_state.uploaded_image_num >= 1:
-                for uploaded_image in st.session_state.uploaded_image_list:
-                    imageFile = Image.open(uploaded_image).convert('RGB')
-                    content_list.append(imageFile)
-            else:
-                print("Multiple Images mode: No image found")
+            print("content_list:", content_list)  # debug
+            print("params:", params)  # debug
 
-        elif selected_mode == "Video":
-            # Video mode: pass in slice frames of uploaded video
-            print("Video mode in use")
-            if len(st.session_state.chat_history) > 1 and st.session_state.uploaded_video_num == 1:
-                uploaded_video_path = st.session_state.uploaded_video_list[-1]
-                if uploaded_video_path:
-                    frames = encode_video(uploaded_video_path)
-                else:
-                    print("Video Mode: No video found")
+            # Generate and display the model's responses
+            with st.spinner('AI is thinking...'):
+                response = model.chat(image=None, msgs=msgs, context=None, tokenizer=tokenizer, **params)
+                st.session_state.response = st.write_stream(response)
+            st.session_state.chat_history.append({
+                "role": "model",
+                "content": st.session_state.response,
+                "image": None,
+                "video": None
+            })
 
-        # Defining model parameters
-        params = {
-            'sampling': True,
-            'top_p': top_p,
-            'top_k': top_k,
-            'temperature': temperature,
-            'repetition_penalty': repetition_penalty,
-            "max_new_tokens": max_length,
-            "stream": True
-        }
+        st.divider()  # Add separators to the interface
 
-        # Set different input parameters depending on whether to upload a video
-        if st.session_state.uploaded_video_num == 1 and selected_mode == "Video":
-            msgs = [{"role": "user", "content": frames + [user_text]}]
-            # Set decode params for video
-            params["max_inp_length"] = 4352  # Set the maximum input length of the video mode
-            params["use_image_id"] = False  # Do not use image_id
-            params["max_slice_nums"] = 1  # use 1 if cuda OOM and video resolution > 448*448
-        else:
-            content_list.append(user_text)
-            msgs = [{"role": "user", "content": content_list}]
-
-        print("content_list:", content_list)  # debug
-        print("params:", params)  # debug
-
-        # Generate and display the model's responses
-        response = model.chat(image=None, msgs=msgs, context=None, tokenizer=tokenizer, **params)
-        st.session_state.response = st.write_stream(response)
-        st.session_state.chat_history.append({
-            "role": "model",
-            "content": st.session_state.response,
-            "image": None,
-            "video": None
-        })
-
-    st.divider()  # Add separators to the interface

From 3745c3316a4064439f94444e44242c581028a912 Mon Sep 17 00:00:00 2001
From: JamePeng
Date: Tue, 27 Aug 2024 04:20:48 +0800
Subject: [PATCH 4/6] Update web_demo_streamlit-minicpmv2_6.py

---
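Note: decord frame extraction runs synchronously and can take several seconds
for a long clip, so the UI would otherwise appear frozen while encoding; the
spinner is scoped to just the `encode_video` call:

    with st.spinner('Encoding your video, please wait...'):
        frames = encode_video(uploaded_video_path)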
 web_demo_streamlit-minicpmv2_6.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/web_demo_streamlit-minicpmv2_6.py b/web_demo_streamlit-minicpmv2_6.py
index 0fa5858..1b00a86 100644
--- a/web_demo_streamlit-minicpmv2_6.py
+++ b/web_demo_streamlit-minicpmv2_6.py
@@ -225,7 +225,8 @@ if user_text is not None:
                 if len(st.session_state.chat_history) > 1 and st.session_state.uploaded_video_num == 1:
                     uploaded_video_path = st.session_state.uploaded_video_list[-1]
                     if uploaded_video_path:
-                        frames = encode_video(uploaded_video_path)
+                        with st.spinner('Encoding your video, please wait...'):
+                            frames = encode_video(uploaded_video_path)
                     else:
                         print("Video Mode: No video found")
 

From 344ddc2cb1bc5cb246590d6a1372323d13c10c10 Mon Sep 17 00:00:00 2001
From: JamePeng
Date: Tue, 27 Aug 2024 04:40:48 +0800
Subject: [PATCH 5/6] Optimize video frame sampling logic

- Replaced manual index calculation with `np.linspace` for improved
  efficiency and readability.
- Reduced computation overhead by utilizing NumPy's vectorized operations
  for generating evenly spaced frame indices.
---
 web_demo_streamlit-minicpmv2_6.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/web_demo_streamlit-minicpmv2_6.py b/web_demo_streamlit-minicpmv2_6.py
index 1b00a86..6cdedd2 100644
--- a/web_demo_streamlit-minicpmv2_6.py
+++ b/web_demo_streamlit-minicpmv2_6.py
@@ -151,7 +151,7 @@ def encode_video(video_path):
     def uniform_sample(frame_indices, num_samples):
         # Calculate sampling interval and uniformly sample frame indices
         gap = len(frame_indices) / num_samples
-        sampled_idxs = [int(i * gap + gap / 2) for i in range(num_samples)]
+        sampled_idxs = np.linspace(gap / 2, len(frame_indices) - gap / 2, num_samples, dtype=int)
         return [frame_indices[i] for i in sampled_idxs]
 
     # Read the video and set the decoder's context to CPU

From b109c67478d73baf337c13a5cbeede18a584f5b6 Mon Sep 17 00:00:00 2001
From: JamePeng
Date: Tue, 27 Aug 2024 20:52:52 +0800
Subject: [PATCH 6/6] fixed missing import

---
 web_demo_streamlit-minicpmv2_6.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/web_demo_streamlit-minicpmv2_6.py b/web_demo_streamlit-minicpmv2_6.py
index 6cdedd2..0ad8be5 100644
--- a/web_demo_streamlit-minicpmv2_6.py
+++ b/web_demo_streamlit-minicpmv2_6.py
@@ -4,6 +4,7 @@ import streamlit as st
 import torch
 from PIL import Image
 from decord import VideoReader, cpu
+import numpy as np
 from transformers import AutoModel, AutoTokenizer
 
 # Model path
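Note on PATCH 5: the `np.linspace` expression is a drop-in replacement for the
original list comprehension; both pick the midpoint of each of `num_samples`
equal-width windows over the frame-index list. A minimal standalone sketch
(the 300-frame input is a made-up example, independent of the demo code):

    import numpy as np

    MAX_NUM_FRAMES = 64

    def uniform_sample_loop(frame_indices, num_samples):
        # PATCH 2 version: midpoint of each equal-width window
        gap = len(frame_indices) / num_samples
        sampled_idxs = [int(i * gap + gap / 2) for i in range(num_samples)]
        return [frame_indices[i] for i in sampled_idxs]

    def uniform_sample_linspace(frame_indices, num_samples):
        # PATCH 5 version: same midpoints, vectorized with np.linspace
        gap = len(frame_indices) / num_samples
        sampled_idxs = np.linspace(gap / 2, len(frame_indices) - gap / 2, num_samples, dtype=int)
        return [frame_indices[i] for i in sampled_idxs]

    # e.g. a 5-minute clip sampled at 1 FPS gives 300 candidate frames
    frame_idx = list(range(300))
    assert uniform_sample_loop(frame_idx, MAX_NUM_FRAMES) == uniform_sample_linspace(frame_idx, MAX_NUM_FRAMES)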