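# Usage sketch: the script name, config path, and checkpoint path below are
# placeholders for a pose_hrnet experiment file and its pretrained weights;
# adjust them to your setup. Extra KEY VALUE pairs after the flags are passed
# through to the config via the 'opts' argument.
#
#   python demo.py \
#       --cfg experiments/coco/hrnet/w32_256x192_adam_lr1e-3.yaml \
#       --videoFile input.mp4 \
#       --outputDir /output/ \
#       --inferenceFps 10 \
#       --writeBoxFrames \
#       TEST.MODEL_FILE models/pose_hrnet_w32_256x192.pth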
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import csv
import os
import shutil

from PIL import Image
import torch
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision
import cv2
import numpy as np


import _init_paths
import models
from config import cfg
from config import update_config
from core.function import get_final_preds
from utils.transforms import get_affine_transform
COCO_KEYPOINT_INDEXES = {
    0: 'nose',
    1: 'left_eye',
    2: 'right_eye',
    3: 'left_ear',
    4: 'right_ear',
    5: 'left_shoulder',
    6: 'right_shoulder',
    7: 'left_elbow',
    8: 'right_elbow',
    9: 'left_wrist',
    10: 'right_wrist',
    11: 'left_hip',
    12: 'right_hip',
    13: 'left_knee',
    14: 'right_knee',
    15: 'left_ankle',
    16: 'right_ankle'
}
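# This is the standard COCO keypoint order. The CSV columns written in main()
# follow it, which assumes the pose checkpoint was trained on COCO so that
# joint i of the model output corresponds to index i above.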

COCO_INSTANCE_CATEGORY_NAMES = [
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
    'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
    'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
    'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table',
    'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book',
    'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]

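# A note on the detector interface: torchvision's detection models in eval
# mode take a list of 3xHxW float tensors and return one dict per image with
# 'boxes' (Nx4, [x1, y1, x2, y2]), 'labels', and 'scores', with detections in
# descending score order; the prefix slicing below relies on that ordering.
# The 'N/A' entries above pad category ids that COCO defines but never
# annotates, so labels can index the list directly.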

def get_person_detection_boxes(model, img, threshold=0.5):
    pil_image = Image.fromarray(img)  # load the image (expects an RGB numpy array)
    transform = transforms.Compose([transforms.ToTensor()])  # define the PyTorch transform
    transformed_img = transform(pil_image)  # apply the transform to the image
    pred = model([transformed_img])  # pass the image to the model
    pred_classes = [COCO_INSTANCE_CATEGORY_NAMES[i]
                    for i in list(pred[0]['labels'].numpy())]  # predicted class names
    pred_boxes = [[(i[0], i[1]), (i[2], i[3])]
                  for i in list(pred[0]['boxes'].detach().numpy())]  # bounding boxes
    pred_score = list(pred[0]['scores'].detach().numpy())
    if not pred_score:
        return []
    # Get the indices with a score greater than the threshold; since scores are
    # sorted in descending order, these form a prefix of the prediction lists.
    keep = [idx for idx, score in enumerate(pred_score) if score > threshold]
    if not keep:
        return []
    pred_boxes = pred_boxes[:keep[-1] + 1]
    pred_classes = pred_classes[:keep[-1] + 1]

    person_boxes = []
    for idx, box in enumerate(pred_boxes):
        if pred_classes[idx] == 'person':
            person_boxes.append(box)

    return person_boxes
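# The returned boxes have the form [[(x1, y1), (x2, y2)], ...] in pixel
# coordinates, e.g. (assuming rgb_frame is an RGB numpy array):
#   boxes = get_person_detection_boxes(box_model, rgb_frame, threshold=0.8)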


def get_pose_estimation_prediction(pose_model, image, center, scale):
    rotation = 0

    # pose estimation transformation: crop the box region and warp it to the
    # model's input resolution
    trans = get_affine_transform(center, scale, rotation, cfg.MODEL.IMAGE_SIZE)
    model_input = cv2.warpAffine(
        image,
        trans,
        (int(cfg.MODEL.IMAGE_SIZE[0]), int(cfg.MODEL.IMAGE_SIZE[1])),
        flags=cv2.INTER_LINEAR)
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],  # ImageNet statistics
                             std=[0.229, 0.224, 0.225]),
    ])

    # pose estimation inference
    model_input = transform(model_input).unsqueeze(0)
    # switch to evaluate mode
    pose_model.eval()
    with torch.no_grad():
        # compute output heatmap
        output = pose_model(model_input)
        # decode the heatmaps back into coordinates in the original image
        preds, _ = get_final_preds(
            cfg,
            output.clone().cpu().numpy(),
            np.asarray([center]),
            np.asarray([scale]))

    return preds
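# preds has shape (1, num_joints, 2): one (x, y) pair per keypoint, already
# mapped back from heatmap space to the original image coordinates.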


def box_to_center_scale(box, model_image_width, model_image_height):
    """convert a box to the center, scale information required for pose transformation
    Parameters
    ----------
    box : list of tuple
        list of length 2 with two tuples of floats representing
        the top-left and bottom-right corners of a box

    model_image_width : int
    model_image_height : int

    Returns
    -------
    (numpy array, numpy array)
        Two numpy arrays, coordinates for the center of the box and the scale of the box
    """
    center = np.zeros((2), dtype=np.float32)

    top_left_corner = box[0]
    bottom_right_corner = box[1]
    box_width = bottom_right_corner[0] - top_left_corner[0]
    box_height = bottom_right_corner[1] - top_left_corner[1]
    center[0] = top_left_corner[0] + box_width * 0.5
    center[1] = top_left_corner[1] + box_height * 0.5

    aspect_ratio = model_image_width * 1.0 / model_image_height
    pixel_std = 200  # scale is expressed in units of 200 pixels (a COCO convention)

    # pad the box out to the model's aspect ratio so the affine crop does not distort it
    if box_width > aspect_ratio * box_height:
        box_height = box_width * 1.0 / aspect_ratio
    elif box_width < aspect_ratio * box_height:
        box_width = box_height * aspect_ratio
    scale = np.array(
        [box_width * 1.0 / pixel_std, box_height * 1.0 / pixel_std],
        dtype=np.float32)
    if center[0] != -1:
        scale = scale * 1.25  # enlarge the crop slightly so limbs are not clipped

    return center, scale
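# A quick worked example (hypothetical numbers): for a 100x300 box with its
# top-left corner at the origin and a 288x384 model input, aspect_ratio is
# 0.75, so the width is padded to 225 to match it, giving
#   center = (50.0, 150.0)
#   scale  = (225 / 200, 300 / 200) * 1.25 = (1.40625, 1.875)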


def prepare_output_dirs(prefix='/output/'):
    pose_dir = prefix + 'poses/'
    box_dir = prefix + 'boxes/'
    if os.path.exists(pose_dir) and os.path.isdir(pose_dir):
        shutil.rmtree(pose_dir)
    if os.path.exists(box_dir) and os.path.isdir(box_dir):
        shutil.rmtree(box_dir)
    os.makedirs(pose_dir, exist_ok=True)
    os.makedirs(box_dir, exist_ok=True)
    return pose_dir, box_dir
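# Note that any poses/ and boxes/ directories from a previous run are wiped
# above, so stale frames cannot leak into the ffmpeg glob at the end of main().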


def parse_args():
    parser = argparse.ArgumentParser(description='Run keypoint inference on a video')
    # general
    parser.add_argument('--cfg', type=str, required=True)
    parser.add_argument('--videoFile', type=str, required=True)
    parser.add_argument('--outputDir', type=str, default='/output/')
    parser.add_argument('--inferenceFps', type=int, default=10)
    parser.add_argument('--writeBoxFrames', action='store_true')

    parser.add_argument('opts',
                        help='Modify config options using the command-line',
                        default=None,
                        nargs=argparse.REMAINDER)

    args = parser.parse_args()

    # args expected by supporting codebase
    args.modelDir = ''
    args.logDir = ''
    args.dataDir = ''
    args.prevModelDir = ''
    return args


def main():
    # cudnn related setting
    cudnn.benchmark = cfg.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED

    args = parse_args()
    update_config(cfg, args)
    pose_dir, box_dir = prepare_output_dirs(args.outputDir)
    csv_output_filename = args.outputDir + 'pose-data.csv'
    csv_output_rows = []

    box_model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
    box_model.eval()

    # look up the constructor for the configured pose network, e.g. models.pose_hrnet
    pose_model = eval('models.' + cfg.MODEL.NAME + '.get_pose_net')(
        cfg, is_train=False
    )

    if cfg.TEST.MODEL_FILE:
        print('=> loading model from {}'.format(cfg.TEST.MODEL_FILE))
        pose_model.load_state_dict(torch.load(cfg.TEST.MODEL_FILE), strict=False)
    else:
        raise ValueError('expected model defined in config at TEST.MODEL_FILE')

    pose_model = torch.nn.DataParallel(pose_model, device_ids=cfg.GPUS).cuda()

    # Load the video and work out how many frames to skip between inferences
    vidcap = cv2.VideoCapture(args.videoFile)
    fps = vidcap.get(cv2.CAP_PROP_FPS)
    if fps < args.inferenceFps:
        print('desired inference fps is ' + str(args.inferenceFps) + ' but video fps is ' + str(fps))
        exit()
    every_nth_frame = round(fps / args.inferenceFps)

    success, image_bgr = vidcap.read()
    count = 0

    while success:
        if count % every_nth_frame != 0:
            success, image_bgr = vidcap.read()
            count += 1
            continue

        image = image_bgr[:, :, [2, 1, 0]]  # BGR -> RGB for the detector
        count_str = str(count).zfill(32)

        # object detection box
        pred_boxes = get_person_detection_boxes(box_model, image, threshold=0.8)
        if args.writeBoxFrames:
            image_bgr_box = image_bgr.copy()
            for box in pred_boxes:
                # cv2.rectangle expects integer pixel coordinates
                cv2.rectangle(image_bgr_box,
                              (int(box[0][0]), int(box[0][1])),
                              (int(box[1][0]), int(box[1][1])),
                              color=(0, 255, 0),
                              thickness=3)  # draw rectangle with the coordinates
            cv2.imwrite(box_dir + 'box%s.jpg' % count_str, image_bgr_box)
        if not pred_boxes:
            success, image_bgr = vidcap.read()
            count += 1
            continue

        # pose estimation
        box = pred_boxes[0]  # assume there is only 1 person
        center, scale = box_to_center_scale(box, cfg.MODEL.IMAGE_SIZE[0], cfg.MODEL.IMAGE_SIZE[1])
        image_pose = image.copy() if cfg.DATASET.COLOR_RGB else image_bgr.copy()
        pose_preds = get_pose_estimation_prediction(pose_model, image_pose, center, scale)

        new_csv_row = [count]  # the 'frame' column promised by the CSV header
        for _, mat in enumerate(pose_preds[0]):
            x_coord, y_coord = int(mat[0]), int(mat[1])
            cv2.circle(image_bgr, (x_coord, y_coord), 4, (255, 0, 0), 2)
            new_csv_row.extend([x_coord, y_coord])

        csv_output_rows.append(new_csv_row)
        cv2.imwrite(pose_dir + 'pose%s.jpg' % count_str, image_bgr)

        # get next frame
        success, image_bgr = vidcap.read()
        count += 1
    # write csv
    csv_headers = ['frame']
    for keypoint in COCO_KEYPOINT_INDEXES.values():
        csv_headers.extend([keypoint + '_x', keypoint + '_y'])

    with open(csv_output_filename, 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(csv_headers)
        csvwriter.writerows(csv_output_rows)

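    # Stitch the annotated frames back into a video. The flags: '-r' before
    # '-i' sets the input frame rate, '-pattern_type glob' matches the
    # zero-padded jpg names in order, and '-pix_fmt yuv420p' keeps the H.264
    # output playable in most players. This assumes ffmpeg is on the PATH.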
    os.system("ffmpeg -y -r "
              + str(args.inferenceFps)
              + " -pattern_type glob -i '"
              + pose_dir
              + "/*.jpg' -c:v libx264 -vf fps="
              + str(args.inferenceFps) + " -pix_fmt yuv420p " + args.outputDir + "movie.mp4")


if __name__ == '__main__':
    main()