Add code

2026-02-05 01:49:23 +08:00 · 2024-09-26 12:38:42 -04:00
parent 11c828edb5
commit d5f5db5f9b
10 changed files with 626 additions and 126 deletions
--- a/demo/app.py
+++ b/demo/app.py
@@ -1,88 +1,42 @@
 import gradio as gr
 import cv2
-import numpy as np
+from huggingface_hub import hf_hub_download
 from gradio_webrtc import WebRTC
-from pathlib import Path
 from twilio.rest import Client
 import os
+from inference import YOLOv10
+
+model_file = hf_hub_download(
+    repo_id="onnx-community/yolov10n", filename="onnx/model.onnx"
+)
+
+model = YOLOv10(model_file)

 account_sid = os.environ.get("TWILIO_ACCOUNT_SID")
 auth_token = os.environ.get("TWILIO_AUTH_TOKEN")
-client = Client(account_sid, auth_token)

-token = client.tokens.create()
+if account_sid and auth_token:
+    client = Client(account_sid, auth_token)

-rtc_configuration = {
-    "iceServers": token.ice_servers,
-    "iceTransportPolicy": "relay",
-}
+    token = client.tokens.create()

-CLASSES = [
-    "background",
-    "aeroplane",
-    "bicycle",
-    "bird",
-    "boat",
-    "bottle",
-    "bus",
-    "car",
-    "cat",
-    "chair",
-    "cow",
-    "diningtable",
-    "dog",
-    "horse",
-    "motorbike",
-    "person",
-    "pottedplant",
-    "sheep",
-    "sofa",
-    "train",
-    "tvmonitor",
-]
-COLORS = np.random.uniform(0, 255, size=(len(CLASSES), 3))
+    rtc_configuration = {
+        "iceServers": token.ice_servers,
+        "iceTransportPolicy": "relay",
+    }
+else:
+    rtc_configuration = None

-directory = Path(__file__).parent
-
-MODEL = str((directory / "MobileNetSSD_deploy.caffemodel").resolve())
-PROTOTXT = str((directory / "MobileNetSSD_deploy.prototxt.txt").resolve())
-net = cv2.dnn.readNetFromCaffe(PROTOTXT, MODEL)

+rtc_configuration = None

 def detection(image, conf_threshold=0.3):
-
-    blob = cv2.dnn.blobFromImage(
-        cv2.resize(image, (300, 300)), 0.007843, (300, 300), 127.5
-    )
-    net.setInput(blob)
-
-    detections = net.forward()
-    image = cv2.resize(image, (500, 500))
-    (h, w) = image.shape[:2]
-    labels = []
-    for i in np.arange(0, detections.shape[2]):
-        confidence = detections[0, 0, i, 2]
-
-        if confidence > conf_threshold:
-            # extract the index of the class label from the `detections`,
-            # then compute the (x, y)-coordinates of the bounding box for
-            # the object
-            idx = int(detections[0, 0, i, 1])
-            box = detections[0, 0, i, 3:7] * np.array([w, h, w, h])
-            (startX, startY, endX, endY) = box.astype("int")
-
-            # display the prediction
-            label = f"{CLASSES[idx]}: {round(confidence * 100, 2)}%"
-            labels.append(label)
-            cv2.rectangle(image, (startX, startY), (endX, endY), COLORS[idx], 2)
-            y = startY - 15 if startY - 15 > 15 else startY + 15
-            cv2.putText(
-                image, label, (startX, y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, COLORS[idx], 2
-            )
-    return image
+    image = cv2.resize(image, (model.input_width, model.input_height))
+    new_image = model.detect_objects(image, conf_threshold)
+    return cv2.resize(new_image, (500, 500))


-css=""".my-group {max-width: 600px !important; max-height: 600 !important;}
+css = """.my-group {max-width: 600px !important; max-height: 600 !important;}
                      .my-column {display: flex !important; justify-content: center !important; align-items: center !important};"""


@@ -90,18 +44,20 @@ with gr.Blocks(css=css) as demo:
    gr.HTML(
        """
    <h1 style='text-align: center'>
-    YOLOv10 Webcam Stream
+    YOLOv10 Webcam Stream (Powered by WebRTC ⚡️)
    </h1>
-    """)
+    """
+    )
    gr.HTML(
        """
        <h3 style='text-align: center'>
        <a href='https://arxiv.org/abs/2405.14458' target='_blank'>arXiv</a> | <a href='https://github.com/THU-MIG/yolov10' target='_blank'>github</a>
        </h3>
-        """)
+        """
+    )
    with gr.Column(elem_classes=["my-column"]):
        with gr.Group(elem_classes=["my-group"]):
-            image = WebRTC(label="Strean", rtc_configuration=rtc_configuration)
+            image = WebRTC(label="Stream", rtc_configuration=rtc_configuration)
            conf_threshold = gr.Slider(
                label="Confidence Threshold",
                minimum=0.0,
@@ -109,13 +65,10 @@ with gr.Blocks(css=css) as demo:
                step=0.05,
                value=0.30,
            )
-        
-        image.webrtc_stream(
-            fn=detection,
-            inputs=[image],
-            stream_every=0.05,
-            time_limit=30
+
+        image.stream(
+            fn=detection, inputs=[image, conf_threshold], outputs=[image], time_limit=10
        )

-if __name__ == '__main__':
+if __name__ == "__main__":
    demo.launch()
--- a/demo/draw_boxes.py
+++ b/demo/draw_boxes.py
@@ -25,7 +25,7 @@ def draw_bounding_boxes(image, results: dict, model, threshold=0.3):
            color = get_color(label)

            # Draw bounding box
-            draw.rectangle(box, outline=color, width=3) # type: ignore
+            draw.rectangle(box, outline=color, width=3)  # type: ignore

            # Prepare text
            text = f"{label}: {score:.2f}"
@@ -35,8 +35,8 @@ def draw_bounding_boxes(image, results: dict, model, threshold=0.3):

            # Draw text background
            draw.rectangle(
-                [box[0], box[1] - text_height - 4, box[0] + text_width, box[1]], # type: ignore
-                fill=color, # type: ignore
+                [box[0], box[1] - text_height - 4, box[0] + text_width, box[1]],  # type: ignore
+                fill=color,  # type: ignore
            )

            # Draw text
--- a/demo/inference.py
+++ b/demo/inference.py
@@ -0,0 +1,146 @@
+import time
+import cv2
+import numpy as np
+import onnxruntime
+
+from utils import draw_detections
+
+
+class YOLOv10:
+    def __init__(self, path):
+
+        # Initialize model
+        self.initialize_model(path)
+
+    def __call__(self, image):
+        return self.detect_objects(image)
+
+    def initialize_model(self, path):
+        self.session = onnxruntime.InferenceSession(
+            path, providers=onnxruntime.get_available_providers()
+        )
+        # Get model info
+        self.get_input_details()
+        self.get_output_details()
+
+    def detect_objects(self, image, conf_threshold=0.3):
+        input_tensor = self.prepare_input(image)
+
+        # Perform inference on the image
+        new_image = self.inference(image, input_tensor, conf_threshold)
+
+        return new_image
+
+    def prepare_input(self, image):
+        self.img_height, self.img_width = image.shape[:2]
+
+        input_img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+
+        # Resize input image
+        input_img = cv2.resize(input_img, (self.input_width, self.input_height))
+
+        # Scale input pixel values to 0 to 1
+        input_img = input_img / 255.0
+        input_img = input_img.transpose(2, 0, 1)
+        input_tensor = input_img[np.newaxis, :, :, :].astype(np.float32)
+
+        return input_tensor
+
+    def inference(self, image, input_tensor, conf_threshold=0.3):
+        start = time.perf_counter()
+        outputs = self.session.run(
+            self.output_names, {self.input_names[0]: input_tensor}
+        )
+
+        print(f"Inference time: {(time.perf_counter() - start)*1000:.2f} ms")
+        boxes, scores, class_ids, = self.process_output(outputs, conf_threshold)
+        return self.draw_detections(image, boxes, scores, class_ids)
+
+    def process_output(self, output, conf_threshold=0.3):
+        predictions = np.squeeze(output[0])
+
+        # Filter out object confidence scores below threshold
+        scores = predictions[:, 4]
+        predictions = predictions[scores > conf_threshold, :]
+        scores = scores[scores > conf_threshold]
+
+        if len(scores) == 0:
+            return [], [], []
+
+        # Get the class with the highest confidence
+        class_ids = np.argmax(predictions[:, 4:], axis=1)
+
+        # Get bounding boxes for each object
+        boxes = self.extract_boxes(predictions)
+
+        return boxes, scores, class_ids
+
+    def extract_boxes(self, predictions):
+        # Extract boxes from predictions
+        boxes = predictions[:, :4]
+
+        # Scale boxes to original image dimensions
+        boxes = self.rescale_boxes(boxes)
+
+        # Convert boxes to xyxy format
+        #boxes = xywh2xyxy(boxes)
+
+        return boxes
+
+    def rescale_boxes(self, boxes):
+        # Rescale boxes to original image dimensions
+        input_shape = np.array(
+            [self.input_width, self.input_height, self.input_width, self.input_height]
+        )
+        boxes = np.divide(boxes, input_shape, dtype=np.float32)
+        boxes *= np.array(
+            [self.img_width, self.img_height, self.img_width, self.img_height]
+        )
+        return boxes
+
+    def draw_detections(self, image, boxes, scores, class_ids, draw_scores=True, mask_alpha=0.4):
+        return draw_detections(
+            image, boxes, scores, class_ids, mask_alpha
+        )
+
+    def get_input_details(self):
+        model_inputs = self.session.get_inputs()
+        self.input_names = [model_inputs[i].name for i in range(len(model_inputs))]
+
+        self.input_shape = model_inputs[0].shape
+        self.input_height = self.input_shape[2]
+        self.input_width = self.input_shape[3]
+
+    def get_output_details(self):
+        model_outputs = self.session.get_outputs()
+        self.output_names = [model_outputs[i].name for i in range(len(model_outputs))]
+
+
+if __name__ == "__main__":
+    import requests
+    import tempfile
+    from huggingface_hub import hf_hub_download
+
+    model_file = hf_hub_download(
+        repo_id="onnx-community/yolov10s", filename="onnx/model.onnx"
+    )
+
+    yolov8_detector = YOLOv10(model_file)
+
+    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as f:
+        f.write(
+            requests.get(
+                "https://live.staticflickr.com/13/19041780_d6fd803de0_3k.jpg"
+            ).content
+        )
+        f.seek(0)
+        img = cv2.imread(f.name)
+
+    # # Detect Objects
+    combined_image = yolov8_detector.detect_objects(img)
+
+
+    # Draw detections
+    cv2.namedWindow("Output", cv2.WINDOW_NORMAL)
+    cv2.imshow("Output", combined_image)
+    cv2.waitKey(0)
--- a/demo/requirements.txt
+++ b/demo/requirements.txt
@@ -2,4 +2,5 @@ safetensors==0.4.3
 opencv-python
 twilio
 https://huggingface.co/datasets/freddyaboulton/bucket/resolve/main/gradio-5.0.0b3-py3-none-any.whl
-https://huggingface.co/datasets/freddyaboulton/bucket/resolve/main/gradio_webrtc-0.0.1-py3-none-any.whl
+https://huggingface.co/datasets/freddyaboulton/bucket/resolve/main/gradio_webrtc-0.0.1-py3-none-any.whl
+onnxruntime-gpu
--- a/demo/space.py
+++ b/demo/space.py
--- a/demo/utils.py
+++ b/demo/utils.py
@@ -0,0 +1,237 @@
+import numpy as np
+import cv2
+
+class_names = [
+    "person",
+    "bicycle",
+    "car",
+    "motorcycle",
+    "airplane",
+    "bus",
+    "train",
+    "truck",
+    "boat",
+    "traffic light",
+    "fire hydrant",
+    "stop sign",
+    "parking meter",
+    "bench",
+    "bird",
+    "cat",
+    "dog",
+    "horse",
+    "sheep",
+    "cow",
+    "elephant",
+    "bear",
+    "zebra",
+    "giraffe",
+    "backpack",
+    "umbrella",
+    "handbag",
+    "tie",
+    "suitcase",
+    "frisbee",
+    "skis",
+    "snowboard",
+    "sports ball",
+    "kite",
+    "baseball bat",
+    "baseball glove",
+    "skateboard",
+    "surfboard",
+    "tennis racket",
+    "bottle",
+    "wine glass",
+    "cup",
+    "fork",
+    "knife",
+    "spoon",
+    "bowl",
+    "banana",
+    "apple",
+    "sandwich",
+    "orange",
+    "broccoli",
+    "carrot",
+    "hot dog",
+    "pizza",
+    "donut",
+    "cake",
+    "chair",
+    "couch",
+    "potted plant",
+    "bed",
+    "dining table",
+    "toilet",
+    "tv",
+    "laptop",
+    "mouse",
+    "remote",
+    "keyboard",
+    "cell phone",
+    "microwave",
+    "oven",
+    "toaster",
+    "sink",
+    "refrigerator",
+    "book",
+    "clock",
+    "vase",
+    "scissors",
+    "teddy bear",
+    "hair drier",
+    "toothbrush",
+]
+
+# Create a list of colors for each class where each color is a tuple of 3 integer values
+rng = np.random.default_rng(3)
+colors = rng.uniform(0, 255, size=(len(class_names), 3))
+
+
+def nms(boxes, scores, iou_threshold):
+    # Sort by score
+    sorted_indices = np.argsort(scores)[::-1]
+
+    keep_boxes = []
+    while sorted_indices.size > 0:
+        # Pick the last box
+        box_id = sorted_indices[0]
+        keep_boxes.append(box_id)
+
+        # Compute IoU of the picked box with the rest
+        ious = compute_iou(boxes[box_id, :], boxes[sorted_indices[1:], :])
+
+        # Remove boxes with IoU over the threshold
+        keep_indices = np.where(ious < iou_threshold)[0]
+
+        # print(keep_indices.shape, sorted_indices.shape)
+        sorted_indices = sorted_indices[keep_indices + 1]
+
+    return keep_boxes
+
+
+def multiclass_nms(boxes, scores, class_ids, iou_threshold):
+    unique_class_ids = np.unique(class_ids)
+
+    keep_boxes = []
+    for class_id in unique_class_ids:
+        class_indices = np.where(class_ids == class_id)[0]
+        class_boxes = boxes[class_indices, :]
+        class_scores = scores[class_indices]
+
+        class_keep_boxes = nms(class_boxes, class_scores, iou_threshold)
+        keep_boxes.extend(class_indices[class_keep_boxes])
+
+    return keep_boxes
+
+
+def compute_iou(box, boxes):
+    # Compute xmin, ymin, xmax, ymax for both boxes
+    xmin = np.maximum(box[0], boxes[:, 0])
+    ymin = np.maximum(box[1], boxes[:, 1])
+    xmax = np.minimum(box[2], boxes[:, 2])
+    ymax = np.minimum(box[3], boxes[:, 3])
+
+    # Compute intersection area
+    intersection_area = np.maximum(0, xmax - xmin) * np.maximum(0, ymax - ymin)
+
+    # Compute union area
+    box_area = (box[2] - box[0]) * (box[3] - box[1])
+    boxes_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
+    union_area = box_area + boxes_area - intersection_area
+
+    # Compute IoU
+    iou = intersection_area / union_area
+
+    return iou
+
+
+def xywh2xyxy(x):
+    # Convert bounding box (x, y, w, h) to bounding box (x1, y1, x2, y2)
+    y = np.copy(x)
+    y[..., 0] = x[..., 0] - x[..., 2] / 2
+    y[..., 1] = x[..., 1] - x[..., 3] / 2
+    y[..., 2] = x[..., 0] + x[..., 2] / 2
+    y[..., 3] = x[..., 1] + x[..., 3] / 2
+    return y
+
+
+def draw_detections(image, boxes, scores, class_ids, mask_alpha=0.3):
+    det_img = image.copy()
+
+    img_height, img_width = image.shape[:2]
+    font_size = min([img_height, img_width]) * 0.0006
+    text_thickness = int(min([img_height, img_width]) * 0.001)
+
+    #det_img = draw_masks(det_img, boxes, class_ids, mask_alpha)
+
+    # Draw bounding boxes and labels of detections
+    for class_id, box, score in zip(class_ids, boxes, scores):
+        color = colors[class_id]
+
+        draw_box(det_img, box, color)
+
+        label = class_names[class_id]
+        caption = f"{label} {int(score * 100)}%"
+        draw_text(det_img, caption, box, color, font_size, text_thickness)
+
+    return det_img
+
+
+def draw_box(
+    image: np.ndarray,
+    box: np.ndarray,
+    color: tuple[int, int, int] = (0, 0, 255),
+    thickness: int = 2,
+) -> np.ndarray:
+    x1, y1, x2, y2 = box.astype(int)
+    return cv2.rectangle(image, (x1, y1), (x2, y2), color, thickness)
+
+
+def draw_text(
+    image: np.ndarray,
+    text: str,
+    box: np.ndarray,
+    color: tuple[int, int, int] = (0, 0, 255),
+    font_size: float = 0.001,
+    text_thickness: int = 2,
+) -> np.ndarray:
+    x1, y1, x2, y2 = box.astype(int)
+    (tw, th), _ = cv2.getTextSize(
+        text=text,
+        fontFace=cv2.FONT_HERSHEY_SIMPLEX,
+        fontScale=font_size,
+        thickness=text_thickness,
+    )
+    th = int(th * 1.2)
+
+    cv2.rectangle(image, (x1, y1), (x1 + tw, y1 - th), color, -1)
+
+    return cv2.putText(
+        image,
+        text,
+        (x1, y1),
+        cv2.FONT_HERSHEY_SIMPLEX,
+        font_size,
+        (255, 255, 255),
+        text_thickness,
+        cv2.LINE_AA,
+    )
+
+
+def draw_masks(
+    image: np.ndarray, boxes: np.ndarray, classes: np.ndarray, mask_alpha: float = 0.3
+) -> np.ndarray:
+    mask_img = image.copy()
+
+    # Draw bounding boxes and labels of detections
+    for box, class_id in zip(boxes, classes):
+        color = colors[class_id]
+
+        x1, y1, x2, y2 = box.astype(int)
+
+        # Draw fill rectangle in mask image
+        cv2.rectangle(mask_img, (x1, y1), (x2, y2), color, -1)
+
+    return cv2.addWeighted(mask_img, mask_alpha, image, 1 - mask_alpha, 0)