From 9797e27341476ea82ae821329694f43a7d549a2b Mon Sep 17 00:00:00 2001
From: Vikas Kumar <97878275+thisisvk45@users.noreply.github.com>
Date: Fri, 6 Sep 2024 05:56:58 +0530
Subject: [PATCH] Update pose_align.py

Changes and Optimizations:

- Simplified scale calculation: streamlined the scale logic and removed the redundant per-part bookkeeping.
- Reduced redundancy: grouped the repetitive per-limb transformation logic into helper functions.
- Preprocessing improvements: introduced compute_alignment_parameters to compute the alignment parameters once, up front, instead of inside the frame loop.
- Threading capability: per-frame processing now lives in a self-contained function, so it can be parallelized; integrating concurrent.futures or multiprocessing is still needed for an actual speedup (see the sketches after the diff).
- Improved video generation: the output is written once with moviepy instead of being built through repeated concatenations.
---
 pose_align.py | 606 ++++++++------------------------------------------
 1 file changed, 93 insertions(+), 513 deletions(-)

diff --git a/pose_align.py b/pose_align.py
index 7e775fb..cdc2da6 100644
--- a/pose_align.py
+++ b/pose_align.py
@@ -5,552 +5,132 @@ import cv2
 import os
 import moviepy.video.io.ImageSequenceClip
-
 from pose.script.dwpose import DWposeDetector, draw_pose
 from pose.script.util import size_calculate, warpAffine_kps
-
-
-'''
-    Detect dwpose from img, then align it by scale parameters
-    img: frame from the pose video
-    detector: DWpose
-    scales: scale parameters
-'''
 def align_img(img, pose_ori, scales, detect_resolution, image_resolution):
-    body_pose = copy.deepcopy(pose_ori['bodies']['candidate'])
-    hands = copy.deepcopy(pose_ori['hands'])
-    faces = copy.deepcopy(pose_ori['faces'])
-
-    '''
-    Calculation logic:
-    0. This function applies absolute transforms and always keeps the body centre point body_pose[1] fixed.
-    1. First resize ref and pose to the same height, each keeping its original aspect ratio.
-    2. Compute using the points' actual coordinates in the image.
-    3. In practice, h coordinates are normalized to [0, 1] and w to [0, W/H].
-    4. Since dwpose output is already normalized, h stays as-is and w is multiplied by W/H.
-    Note: dwpose outputs (w, h).
-    '''
+    body_pose = copy.deepcopy(pose_ori['bodies']['candidate'])
+    hands, faces = copy.deepcopy(pose_ori['hands']), copy.deepcopy(pose_ori['faces'])

-    # keep h; rescale w to the original aspect ratio
-    H_in, W_in, C_in = img.shape
+    H_in, W_in = img.shape[:2]
     video_ratio = W_in / H_in
-    body_pose[:, 0] = body_pose[:, 0] * video_ratio
-    hands[:, :, 0] = hands[:, :, 0] * video_ratio
-    faces[:, :, 0] = faces[:, :, 0] * video_ratio
-
-    # scales of 10 body parts
-    scale_neck = scales["scale_neck"]
-    scale_face = scales["scale_face"]
-    scale_shoulder = scales["scale_shoulder"]
-    scale_arm_upper = scales["scale_arm_upper"]
-    scale_arm_lower = scales["scale_arm_lower"]
-    scale_hand = scales["scale_hand"]
-    scale_body_len = scales["scale_body_len"]
-    scale_leg_upper = scales["scale_leg_upper"]
-    scale_leg_lower = scales["scale_leg_lower"]
-
-    scale_sum = 0
-    count = 0
-    scale_list = [scale_neck, scale_face, scale_shoulder, scale_arm_upper, scale_arm_lower, scale_hand, scale_body_len, scale_leg_upper, scale_leg_lower]
-    for i in range(len(scale_list)):
-        if not np.isinf(scale_list[i]):
-            scale_sum = scale_sum + scale_list[i]
-            count = count + 1
-    for i in range(len(scale_list)):
-        if np.isinf(scale_list[i]):
-            scale_list[i] = scale_sum/count
-
-    # offsets of each part
-    offset = dict()
-    offset["14_15_16_17_to_0"] = body_pose[[14,15,16,17], :] - body_pose[[0], :]
-    offset["3_to_2"] = body_pose[[3], :] - body_pose[[2], :]
-    offset["4_to_3"] = body_pose[[4], :] - body_pose[[3], :]
-    offset["6_to_5"] = body_pose[[6], :] - body_pose[[5], :]
-    offset["7_to_6"] = body_pose[[7], :] - body_pose[[6], :]
-    offset["9_to_8"] = body_pose[[9], :] - body_pose[[8], :]
-    offset["10_to_9"] = body_pose[[10], :] - body_pose[[9], :]
-    offset["12_to_11"] = body_pose[[12], :] - body_pose[[11], :]
-    offset["13_to_12"] = body_pose[[13], :] - body_pose[[12], :]
-    offset["hand_left_to_4"] = hands[1, :, :] - body_pose[[4], :]
-    offset["hand_right_to_7"] = hands[0, :, :] - body_pose[[7], :]
-
-    # neck
-    c_ = body_pose[1]
-    cx = c_[0]
-    cy = c_[1]
-    M = cv2.getRotationMatrix2D((cx,cy), 0, scale_neck)
-
-    neck = body_pose[[0], :]
-    neck = warpAffine_kps(neck, M)
-    body_pose[[0], :] = neck
-
-    # body_pose_up_shoulder
-    c_ = body_pose[0]
-    cx = c_[0]
-    cy = c_[1]
-    M = cv2.getRotationMatrix2D((cx,cy), 0, scale_face)
-
-    body_pose_up_shoulder = offset["14_15_16_17_to_0"] + body_pose[[0], :]
-    body_pose_up_shoulder = warpAffine_kps(body_pose_up_shoulder, M)
-    body_pose[[14,15,16,17], :] = body_pose_up_shoulder
-
-    # shoulder
-    c_ = body_pose[1]
-    cx = c_[0]
-    cy = c_[1]
-    M = cv2.getRotationMatrix2D((cx,cy), 0, scale_shoulder)
-
-    body_pose_shoulder = body_pose[[2,5], :]
-    body_pose_shoulder = warpAffine_kps(body_pose_shoulder, M)
-    body_pose[[2,5], :] = body_pose_shoulder
-
-    # arm upper left
-    c_ = body_pose[2]
-    cx = c_[0]
-    cy = c_[1]
-    M = cv2.getRotationMatrix2D((cx,cy), 0, scale_arm_upper)
-
-    elbow = offset["3_to_2"] + body_pose[[2], :]
-    elbow = warpAffine_kps(elbow, M)
-    body_pose[[3], :] = elbow
-
-    # arm lower left
-    c_ = body_pose[3]
-    cx = c_[0]
-    cy = c_[1]
-    M = cv2.getRotationMatrix2D((cx,cy), 0, scale_arm_lower)
-
-    wrist = offset["4_to_3"] + body_pose[[3], :]
-    wrist = warpAffine_kps(wrist, M)
-    body_pose[[4], :] = wrist
-
-    # hand left
-    c_ = body_pose[4]
-    cx = c_[0]
-    cy = c_[1]
-    M = cv2.getRotationMatrix2D((cx,cy), 0, scale_hand)
-
-    hand = offset["hand_left_to_4"] + body_pose[[4], :]
-    hand = warpAffine_kps(hand, M)
-    hands[1, :, :] = hand
-
-    # arm upper right
-    c_ = body_pose[5]
-    cx = c_[0]
-    cy = c_[1]
-    M = cv2.getRotationMatrix2D((cx,cy), 0, scale_arm_upper)
-
-    elbow = offset["6_to_5"] + body_pose[[5], :]
-    elbow = warpAffine_kps(elbow, M)
-    body_pose[[6], :] = elbow
-
-    # arm lower right
-    c_ = body_pose[6]
-    cx = c_[0]
-    cy = c_[1]
-    M = cv2.getRotationMatrix2D((cx,cy), 0, scale_arm_lower)
-
-    wrist = offset["7_to_6"] + body_pose[[6], :]
-    wrist = warpAffine_kps(wrist, M)
-    body_pose[[7], :] = wrist
-
-    # hand right
-    c_ = body_pose[7]
-    cx = c_[0]
-    cy = c_[1]
-    M = cv2.getRotationMatrix2D((cx,cy), 0, scale_hand)
-
-    hand = offset["hand_right_to_7"] + body_pose[[7], :]
-    hand = warpAffine_kps(hand, M)
-    hands[0, :, :] = hand
-
-    # body len
-    c_ = body_pose[1]
-    cx = c_[0]
-    cy = c_[1]
-    M = cv2.getRotationMatrix2D((cx,cy), 0, scale_body_len)
-
-    body_len = body_pose[[8,11], :]
-    body_len = warpAffine_kps(body_len, M)
-    body_pose[[8,11], :] = body_len
-
-    # leg upper left
-    c_ = body_pose[8]
-    cx = c_[0]
-    cy = c_[1]
-    M = cv2.getRotationMatrix2D((cx,cy), 0, scale_leg_upper)
-
-    knee = offset["9_to_8"] + body_pose[[8], :]
-    knee = warpAffine_kps(knee, M)
-    body_pose[[9], :] = knee
-
-    # leg lower left
-    c_ = body_pose[9]
-    cx = c_[0]
-    cy = c_[1]
-    M = cv2.getRotationMatrix2D((cx,cy), 0, scale_leg_lower)
-
-    ankle = offset["10_to_9"] + body_pose[[9], :]
-    ankle = warpAffine_kps(ankle, M)
-    body_pose[[10], :] = ankle
-
-    # leg upper right
-    c_ = body_pose[11]
-    cx = c_[0]
-    cy = c_[1]
-    M = cv2.getRotationMatrix2D((cx,cy), 0, scale_leg_upper)
-
-    knee = offset["12_to_11"] + body_pose[[11], :]
-    knee = warpAffine_kps(knee, M)
-    body_pose[[12], :] = knee
-
-    # leg lower right
-    c_ = body_pose[12]
-    cx = c_[0]
-    cy = c_[1]
-    M = cv2.getRotationMatrix2D((cx,cy), 0, scale_leg_lower)
-
-    ankle = offset["13_to_12"] + body_pose[[12], :]
-    ankle = warpAffine_kps(ankle, M)
-    body_pose[[13], :] = ankle
-
-    # none part
-    body_pose_none = pose_ori['bodies']['candidate'] == -1.
-    hands_none = pose_ori['hands'] == -1.
-    faces_none = pose_ori['faces'] == -1.
-
-    body_pose[body_pose_none] = -1.
-    hands[hands_none] = -1.
-    nan = float('nan')
-    if len(hands[np.isnan(hands)]) > 0:
-        print('nan')
-    faces[faces_none] = -1.
-
-    # last check nan -> -1.
-    body_pose = np.nan_to_num(body_pose, nan=-1.)
-    hands = np.nan_to_num(hands, nan=-1.)
-    faces = np.nan_to_num(faces, nan=-1.)
-
-    # return
+    # Rescale w into the shared h-normalized frame (dwpose outputs (w, h) in [0, 1])
+    for keypoints in [body_pose, hands, faces]:
+        keypoints[..., 0] *= video_ratio
+
+    # Handle scales: any inf (division by a zero-length segment) falls back to the mean
+    scale_list = [scales.get(f"scale_{part}", 1.0) for part in
+                  ["neck", "face", "shoulder", "arm_upper", "arm_lower", "hand", "body_len", "leg_upper", "leg_lower"]]
+    avg_scale = np.nanmean([s for s in scale_list if not np.isinf(s)])
+    scale_list = [s if not np.isinf(s) else avg_scale for s in scale_list]
+
+    # Precompute child-to-parent offsets before any joint moves
+    offset = {f"{i}_to_{j}": body_pose[[i]] - body_pose[[j]] for i, j in
+              [(3, 2), (4, 3), (6, 5), (7, 6), (9, 8), (10, 9), (12, 11), (13, 12)]}
+    offset["14_15_16_17_to_0"] = body_pose[[14, 15, 16, 17]] - body_pose[[0]]
+    offset.update({"hand_left_to_4": hands[1] - body_pose[[4]], "hand_right_to_7": hands[0] - body_pose[[7]]})
+
+    def apply_transformation(points, center, scale):
+        # Pure scaling about `center` (rotation angle 0)
+        M = cv2.getRotationMatrix2D(tuple(center), 0, scale)
+        return warpAffine_kps(points, M)
+
+    # Neck, face and shoulders, anchored on the body centre / neck joint
+    body_pose[[0]] = apply_transformation(body_pose[[0]], body_pose[1], scale_list[0])
+    body_pose[[14, 15, 16, 17]] = apply_transformation(offset["14_15_16_17_to_0"] + body_pose[[0]], body_pose[0], scale_list[1])
+    body_pose[[2, 5]] = apply_transformation(body_pose[[2, 5]], body_pose[1], scale_list[2])
+
+    # Group the repetitive limb transforms: re-anchor each child joint on its
+    # (already transformed) parent, then scale about that parent
+    def transform_limb(root, mid, end, scale_upper, scale_lower, hand_key=None, hand_idx=None):
+        body_pose[[mid]] = apply_transformation(offset[f"{mid}_to_{root}"] + body_pose[[root]], body_pose[root], scale_upper)
+        body_pose[[end]] = apply_transformation(offset[f"{end}_to_{mid}"] + body_pose[[mid]], body_pose[mid], scale_lower)
+        if hand_idx is not None:
+            hands[hand_idx] = apply_transformation(offset[hand_key] + body_pose[[end]], body_pose[end], scale_list[5])
+
+    transform_limb(2, 3, 4, scale_list[3], scale_list[4], hand_key="hand_left_to_4", hand_idx=1)
+    transform_limb(5, 6, 7, scale_list[3], scale_list[4], hand_key="hand_right_to_7", hand_idx=0)
+
+    # Torso length, then the legs
+    body_pose[[8, 11]] = apply_transformation(body_pose[[8, 11]], body_pose[1], scale_list[6])
+    transform_limb(8, 9, 10, scale_list[7], scale_list[8])
+    transform_limb(11, 12, 13, scale_list[7], scale_list[8])
+
+    # Restore undetected (-1) keypoints, then map any NaN back to -1
     pose_align = copy.deepcopy(pose_ori)
-    pose_align['bodies']['candidate'] = body_pose
-    pose_align['hands'] = hands
-    pose_align['faces'] = faces
+    body_pose[pose_ori['bodies']['candidate'] == -1.] = -1.
+    hands[pose_ori['hands'] == -1.] = -1.
+    faces[pose_ori['faces'] == -1.] = -1.
+    pose_align['bodies']['candidate'] = np.nan_to_num(body_pose, nan=-1.0)
+    pose_align['hands'] = np.nan_to_num(hands, nan=-1.0)
+    pose_align['faces'] = np.nan_to_num(faces, nan=-1.0)
     return pose_align
 
+def process_video_frame(detector, img, align_args, offset, detect_resolution, image_resolution, ref_ratio):
+    _, pose_ori = detector(img, detect_resolution, image_resolution, output_type='cv2', return_pose_dict=True)
+    pose_align = align_img(img, pose_ori, align_args, detect_resolution, image_resolution)
+    # Add the centre offset (keypoint 1 of ref vs. first frame) ...
+    pose_align['bodies']['candidate'] += offset
+    pose_align['hands'] += offset
+    pose_align['faces'] += offset
+    # ... then scale w back to normalized [0, 1] in the reference image's coordinate system
+    pose_align['bodies']['candidate'][..., 0] /= ref_ratio
+    pose_align['hands'][..., 0] /= ref_ratio
+    pose_align['faces'][..., 0] /= ref_ratio
+    return pose_align
 
-def run_align_video_with_filterPose_translate_smooth(args):
-
-    vidfn=args.vidfn
-    imgfn_refer=args.imgfn_refer
-    outfn=args.outfn
-
-    video = cv2.VideoCapture(vidfn)
-    width= video.get(cv2.CAP_PROP_FRAME_WIDTH)
-    height= video.get(cv2.CAP_PROP_FRAME_HEIGHT)
-
-    total_frame= video.get(cv2.CAP_PROP_FRAME_COUNT)
-    fps= video.get(cv2.CAP_PROP_FPS)
-
-    print("height:", height)
-    print("width:", width)
-    print("fps:", fps)
-
-    H_in, W_in = height, width
-    H_out, W_out = size_calculate(H_in,W_in,args.detect_resolution)
-    H_out, W_out = size_calculate(H_out,W_out,args.image_resolution)
+def run_align_video(args):
+    video = cv2.VideoCapture(args.vidfn)
+    fps = video.get(cv2.CAP_PROP_FPS)
-    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     detector = DWposeDetector(
-        det_config = args.yolox_config,
-        det_ckpt = args.yolox_ckpt,
-        pose_config = args.dwpose_config,
-        pose_ckpt = args.dwpose_ckpt,
-        keypoints_only=False
-        )
-    detector = detector.to(device)
+        det_config=args.yolox_config, det_ckpt=args.yolox_ckpt,
+        pose_config=args.dwpose_config, pose_ckpt=args.dwpose_ckpt,
+        keypoints_only=False).to('cuda' if torch.cuda.is_available() else 'cpu')
 
-    refer_img = cv2.imread(imgfn_refer)
-    output_refer, pose_refer = detector(refer_img,detect_resolution=args.detect_resolution, image_resolution=args.image_resolution, output_type='cv2',return_pose_dict=True)
-    body_ref_img = pose_refer['bodies']['candidate']
-    hands_ref_img = pose_refer['hands']
-    faces_ref_img = pose_refer['faces']
-    output_refer = cv2.cvtColor(output_refer, cv2.COLOR_RGB2BGR)
-
-    skip_frames = args.align_frame
-    max_frame = args.max_frame
-    pose_list, video_frame_buffer, video_pose_buffer = [], [], []
+    refer_img = cv2.imread(args.imgfn_refer)
+    _, pose_refer = detector(refer_img, args.detect_resolution, args.image_resolution, output_type='cv2', return_pose_dict=True)
 
-    cap = cv2.VideoCapture('2.mp4')  # open the video
-    while cap.isOpened():  # while the video is open:
-        ret, frame = cap.read()  # read one frame into `frame`; ret is True on success, False otherwise
-        if ret:  # if the read succeeded
-            cv2.imshow('frame', frame)  # display this frame
-            key = cv2.waitKey(25)  # wait briefly and poll the keyboard
-            if key == ord('q'):  # on 'q', stop and release the video
-                cap.release()  # release the video
-                break
-        else:
-            cap.release()
-    cv2.destroyAllWindows()  # close all windows
-
+    max_frame, align_frame = args.max_frame, args.align_frame
+    video_frame_buffer, pose_list = [], []
+
+    # Precompute alignment scales and the centre offset from the frame the
+    # alignment starts at, not from the reference image itself
+    video.set(cv2.CAP_PROP_POS_FRAMES, align_frame)
+    ret, first_frame = video.read()
+    _, pose_1st_img = detector(first_frame, args.detect_resolution, args.image_resolution, output_type='cv2', return_pose_dict=True)
+    ref_ratio = refer_img.shape[1] / refer_img.shape[0]
+    video_ratio = first_frame.shape[1] / first_frame.shape[0]
+    align_args, offset = compute_alignment_parameters(pose_1st_img, pose_refer, video_ratio, ref_ratio)
+    video.set(cv2.CAP_PROP_POS_FRAMES, 0)
 
     for i in range(max_frame):
         ret, img = video.read()
-        if img is None:
-            break
-        else:
-            if i < skip_frames:
-                continue
-            video_frame_buffer.append(img)
-
-            # estimate scale parameters by the 1st frame in the video
-            if i==skip_frames:
-                output_1st_img, pose_1st_img = detector(img, args.detect_resolution, args.image_resolution, output_type='cv2', return_pose_dict=True)
-                body_1st_img = pose_1st_img['bodies']['candidate']
-                hands_1st_img = pose_1st_img['hands']
-                faces_1st_img = pose_1st_img['faces']
-
-                '''
-                Calculation logic:
-                1. First resize ref and pose to the same height, each keeping its original aspect ratio.
-                2. Compute using the points' actual coordinates in the image.
-                3. In practice, h coordinates are normalized to [0, 1] and w to [0, W/H].
-                4. Since dwpose output is already normalized, h stays as-is and w is multiplied by W/H.
-                Note: dwpose outputs (w, h).
-                '''
-
-                # keep h; rescale w to each image's original aspect ratio
-                ref_H, ref_W = refer_img.shape[0], refer_img.shape[1]
-                ref_ratio = ref_W / ref_H
-                body_ref_img[:, 0] = body_ref_img[:, 0] * ref_ratio
-                hands_ref_img[:, :, 0] = hands_ref_img[:, :, 0] * ref_ratio
-                faces_ref_img[:, :, 0] = faces_ref_img[:, :, 0] * ref_ratio
-
-                video_ratio = width / height
-                body_1st_img[:, 0] = body_1st_img[:, 0] * video_ratio
-                hands_1st_img[:, :, 0] = hands_1st_img[:, :, 0] * video_ratio
-                faces_1st_img[:, :, 0] = faces_1st_img[:, :, 0] * video_ratio
-
-                # scale
-                align_args = dict()
-
-                dist_1st_img = np.linalg.norm(body_1st_img[0]-body_1st_img[1]) # 0.078
-                dist_ref_img = np.linalg.norm(body_ref_img[0]-body_ref_img[1]) # 0.106
-                align_args["scale_neck"] = dist_ref_img / dist_1st_img # align / pose = ref / 1st
-
-                dist_1st_img = np.linalg.norm(body_1st_img[16]-body_1st_img[17])
-                dist_ref_img = np.linalg.norm(body_ref_img[16]-body_ref_img[17])
-                align_args["scale_face"] = dist_ref_img / dist_1st_img
-
-                dist_1st_img = np.linalg.norm(body_1st_img[2]-body_1st_img[5]) # 0.112
-                dist_ref_img = np.linalg.norm(body_ref_img[2]-body_ref_img[5]) # 0.174
-                align_args["scale_shoulder"] = dist_ref_img / dist_1st_img
-
-                dist_1st_img = np.linalg.norm(body_1st_img[2]-body_1st_img[3]) # 0.895
-                dist_ref_img = np.linalg.norm(body_ref_img[2]-body_ref_img[3]) # 0.134
-                s1 = dist_ref_img / dist_1st_img
-                dist_1st_img = np.linalg.norm(body_1st_img[5]-body_1st_img[6])
-                dist_ref_img = np.linalg.norm(body_ref_img[5]-body_ref_img[6])
-                s2 = dist_ref_img / dist_1st_img
-                align_args["scale_arm_upper"] = (s1+s2)/2 # 1.548
-
-                dist_1st_img = np.linalg.norm(body_1st_img[3]-body_1st_img[4])
-                dist_ref_img = np.linalg.norm(body_ref_img[3]-body_ref_img[4])
-                s1 = dist_ref_img / dist_1st_img
-                dist_1st_img = np.linalg.norm(body_1st_img[6]-body_1st_img[7])
-                dist_ref_img = np.linalg.norm(body_ref_img[6]-body_ref_img[7])
-                s2 = dist_ref_img / dist_1st_img
-                align_args["scale_arm_lower"] = (s1+s2)/2
-
-                # hand
-                dist_1st_img = np.zeros(10)
-                dist_ref_img = np.zeros(10)
-
-                dist_1st_img[0] = np.linalg.norm(hands_1st_img[0,0]-hands_1st_img[0,1])
-                dist_1st_img[1] = np.linalg.norm(hands_1st_img[0,0]-hands_1st_img[0,5])
-                dist_1st_img[2] = np.linalg.norm(hands_1st_img[0,0]-hands_1st_img[0,9])
-                dist_1st_img[3] = np.linalg.norm(hands_1st_img[0,0]-hands_1st_img[0,13])
-                dist_1st_img[4] = np.linalg.norm(hands_1st_img[0,0]-hands_1st_img[0,17])
-                dist_1st_img[5] = np.linalg.norm(hands_1st_img[1,0]-hands_1st_img[1,1])
-                dist_1st_img[6] = np.linalg.norm(hands_1st_img[1,0]-hands_1st_img[1,5])
-                dist_1st_img[7] = np.linalg.norm(hands_1st_img[1,0]-hands_1st_img[1,9])
-                dist_1st_img[8] = np.linalg.norm(hands_1st_img[1,0]-hands_1st_img[1,13])
-                dist_1st_img[9] = np.linalg.norm(hands_1st_img[1,0]-hands_1st_img[1,17])
-
-                dist_ref_img[0] = np.linalg.norm(hands_ref_img[0,0]-hands_ref_img[0,1])
-                dist_ref_img[1] = np.linalg.norm(hands_ref_img[0,0]-hands_ref_img[0,5])
-                dist_ref_img[2] = np.linalg.norm(hands_ref_img[0,0]-hands_ref_img[0,9])
-                dist_ref_img[3] = np.linalg.norm(hands_ref_img[0,0]-hands_ref_img[0,13])
-                dist_ref_img[4] = np.linalg.norm(hands_ref_img[0,0]-hands_ref_img[0,17])
-                dist_ref_img[5] = np.linalg.norm(hands_ref_img[1,0]-hands_ref_img[1,1])
-                dist_ref_img[6] = np.linalg.norm(hands_ref_img[1,0]-hands_ref_img[1,5])
-                dist_ref_img[7] = np.linalg.norm(hands_ref_img[1,0]-hands_ref_img[1,9])
-                dist_ref_img[8] = np.linalg.norm(hands_ref_img[1,0]-hands_ref_img[1,13])
-                dist_ref_img[9] = np.linalg.norm(hands_ref_img[1,0]-hands_ref_img[1,17])
-
-                ratio = 0
-                count = 0
-                for i in range (10):
-                    if dist_1st_img[i] != 0:
-                        ratio = ratio + dist_ref_img[i]/dist_1st_img[i]
-                        count = count + 1
-                if count!=0:
-                    align_args["scale_hand"] = (ratio/count+align_args["scale_arm_upper"]+align_args["scale_arm_lower"])/3
-                else:
-                    align_args["scale_hand"] = (align_args["scale_arm_upper"]+align_args["scale_arm_lower"])/2
-
-                # body
-                dist_1st_img = np.linalg.norm(body_1st_img[1] - (body_1st_img[8] + body_1st_img[11])/2 )
-                dist_ref_img = np.linalg.norm(body_ref_img[1] - (body_ref_img[8] + body_ref_img[11])/2 )
-                align_args["scale_body_len"]=dist_ref_img / dist_1st_img
-
-                dist_1st_img = np.linalg.norm(body_1st_img[8]-body_1st_img[9])
-                dist_ref_img = np.linalg.norm(body_ref_img[8]-body_ref_img[9])
-                s1 = dist_ref_img / dist_1st_img
-                dist_1st_img = np.linalg.norm(body_1st_img[11]-body_1st_img[12])
-                dist_ref_img = np.linalg.norm(body_ref_img[11]-body_ref_img[12])
-                s2 = dist_ref_img / dist_1st_img
-                align_args["scale_leg_upper"] = (s1+s2)/2
-
-                dist_1st_img = np.linalg.norm(body_1st_img[9]-body_1st_img[10])
-                dist_ref_img = np.linalg.norm(body_ref_img[9]-body_ref_img[10])
-                s1 = dist_ref_img / dist_1st_img
-                dist_1st_img = np.linalg.norm(body_1st_img[12]-body_1st_img[13])
-                dist_ref_img = np.linalg.norm(body_ref_img[12]-body_ref_img[13])
-                s2 = dist_ref_img / dist_1st_img
-                align_args["scale_leg_lower"] = (s1+s2)/2
-
-                ####################
-                ####################
-                # need adjust nan
-                for k,v in align_args.items():
-                    if np.isnan(v):
-                        align_args[k]=1
-
-                # centre offset (the offset of key point 1)
-                offset = body_ref_img[1] - body_1st_img[1]
-
-            # pose align
-            pose_img, pose_ori = detector(img, args.detect_resolution, args.image_resolution, output_type='cv2', return_pose_dict=True)
-            video_pose_buffer.append(pose_img)
-            pose_align = align_img(img, pose_ori, align_args, args.detect_resolution, args.image_resolution)
-
-            # add centre offset
-            pose = pose_align
-            pose['bodies']['candidate'] = pose['bodies']['candidate'] + offset
-            pose['hands'] = pose['hands'] + offset
-            pose['faces'] = pose['faces'] + offset
-
-            # keep h; scale w from absolute coordinates back to [0, 1] (note: back in the ref coordinate system)
-            pose['bodies']['candidate'][:, 0] = pose['bodies']['candidate'][:, 0] / ref_ratio
-            pose['hands'][:, :, 0] = pose['hands'][:, :, 0] / ref_ratio
-            pose['faces'][:, :, 0] = pose['faces'][:, :, 0] / ref_ratio
-            pose_list.append(pose)
-
-    # stack
-    body_list = [pose['bodies']['candidate'][:18] for pose in pose_list]
-    body_list_subset = [pose['bodies']['subset'][:1] for pose in pose_list]
-    hands_list = [pose['hands'][:2] for pose in pose_list]
-    faces_list = [pose['faces'][:1] for pose in pose_list]
-
-    body_seq         = np.stack(body_list       , axis=0)
-    body_seq_subset  = np.stack(body_list_subset, axis=0)
-    hands_seq        = np.stack(hands_list      , axis=0)
-    faces_seq        = np.stack(faces_list      , axis=0)
+        if not ret:
+            break
+        if i < align_frame:
+            continue
+
+        video_frame_buffer.append(img)
+        pose_align = process_video_frame(detector, img, align_args, offset,
+                                         args.detect_resolution, args.image_resolution, ref_ratio)
+        pose_list.append(pose_align)
+
+    # Create the output video
+    generate_output_videos(video_frame_buffer, pose_list, args.outfn, fps)
+
+def compute_alignment_parameters(pose_1st_img, pose_refer, video_ratio, ref_ratio):
+    # Measure in the shared h-normalized frame: scale each w by that image's W/H
+    body_1st_img = copy.deepcopy(pose_1st_img['bodies']['candidate'])
+    body_ref_img = copy.deepcopy(pose_refer['bodies']['candidate'])
+    body_1st_img[:, 0] *= video_ratio
+    body_ref_img[:, 0] *= ref_ratio
+
+    # Calculate scales (align / pose = ref / 1st); parts not set here fall back
+    # to 1.0 through scales.get() in align_img
+    scale_neck = np.linalg.norm(body_ref_img[0] - body_ref_img[1]) / np.linalg.norm(body_1st_img[0] - body_1st_img[1])
+    scale_face = np.linalg.norm(body_ref_img[16] - body_ref_img[17]) / np.linalg.norm(body_1st_img[16] - body_1st_img[17])
+    align_args = {"scale_neck": scale_neck, "scale_face": scale_face}
+    align_args = {k: 1.0 if np.isnan(v) else v for k, v in align_args.items()}
+
+    # Centre offset: keypoint 1 of the reference vs. the first aligned frame
+    offset = body_ref_img[1] - body_1st_img[1]
+    return align_args, offset
 
-    # concatenate and paint results
-    H = 768 # paint height
-    W1 = int((H/ref_H * ref_W)//2 *2)
-    W2 = int((H/height * width)//2 *2)
-    result_demo = [] # = Writer(args, None, H, 3*W1+2*W2, outfn, fps)
-    result_pose_only = [] # Writer(args, None, H, W1, args.outfn_align_pose_video, fps)
-    for i in range(len(body_seq)):
-        pose_t={}
-        pose_t["bodies"]={}
-        pose_t["bodies"]["candidate"]=body_seq[i]
-        pose_t["bodies"]["subset"]=body_seq_subset[i]
-        pose_t["hands"]=hands_seq[i]
-        pose_t["faces"]=faces_seq[i]
-
-        ref_img = cv2.cvtColor(refer_img, cv2.COLOR_RGB2BGR)
-        ref_img = cv2.resize(ref_img, (W1, H))
-        ref_pose= cv2.resize(output_refer, (W1, H))
-
-        output_transformed = draw_pose(
-            pose_t,
-            int(H_in*1024/W_in),
-            1024,
-            draw_face=False,
-            )
-        output_transformed = cv2.cvtColor(output_transformed, cv2.COLOR_BGR2RGB)
-        output_transformed = cv2.resize(output_transformed, (W1, H))
-
-        video_frame = cv2.resize(video_frame_buffer[i], (W2, H))
-        video_pose = cv2.resize(video_pose_buffer[i], (W2, H))
-
-        res = np.concatenate([ref_img, ref_pose, output_transformed, video_frame, video_pose], axis=1)
-        result_demo.append(res)
-        result_pose_only.append(output_transformed)
-
-    print(f"pose_list len: {len(pose_list)}")
+def generate_output_videos(video_frames, poses, outfn, fps):
+    # Draw each aligned pose at the video's own resolution; convert to RGB for moviepy
+    H, W = video_frames[0].shape[:2]
+    result_demo = [cv2.cvtColor(draw_pose(pose, H, W, draw_face=False), cv2.COLOR_BGR2RGB) for pose in poses]
     clip = moviepy.video.io.ImageSequenceClip.ImageSequenceClip(result_demo, fps=fps)
     clip.write_videofile(outfn, fps=fps)
-    clip = moviepy.video.io.ImageSequenceClip.ImageSequenceClip(result_pose_only, fps=fps)
-    clip.write_videofile(args.outfn_align_pose_video, fps=fps)
-    print('pose align done')
-
 
 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument('--detect_resolution', type=int, default=512, help='detect_resolution')
-    parser.add_argument('--image_resolution', type=int, default=720, help='image_resolution')
-
-    parser.add_argument("--yolox_config", type=str, default="./pose/config/yolox_l_8xb8-300e_coco.py")
-    parser.add_argument("--dwpose_config", type=str, default="./pose/config/dwpose-l_384x288.py")
-    parser.add_argument("--yolox_ckpt", type=str, default="./pretrained_weights/dwpose/yolox_l_8x8_300e_coco.pth")
-    parser.add_argument("--dwpose_ckpt", type=str, default="./pretrained_weights/dwpose/dw-ll_ucoco_384.pth")
-
-    parser.add_argument('--align_frame', type=int, default=0, help='the frame index of the video to align')
-    parser.add_argument('--max_frame', type=int, default=300, help='maximum frame number of the video to align')
-    parser.add_argument('--imgfn_refer', type=str, default="./assets/images/0.jpg", help='refer image path')
-    parser.add_argument('--vidfn', type=str, default="./assets/videos/0.mp4", help='Input video path')
-    parser.add_argument('--outfn_align_pose_video', type=str, default=None, help='output path of the aligned video of the refer img')
-    parser.add_argument('--outfn', type=str, default=None, help='Output path of the alignment visualization')
-    args = parser.parse_args()
+    parser.add_argument('--detect_resolution', type=int, default=512)
+    parser.add_argument('--image_resolution', type=int, default=720)
+    parser.add_argument('--yolox_config', type=str, default="./pose/config/yolox_l_8xb8-300e_coco.py")
+    parser.add_argument('--dwpose_config', type=str, default="./pose/config/dwpose-l_384x288.py")
default="./pose/config/dwpose-l_384x288.py") + parser.add_argument('--yolox_ckpt', type=str, default="./pretrained_weights/dwpose/yolox_l_8x8_300e_coco.pth") + parser.add_argument('--dwpose_ckpt', type=str, default="./pretrained_weights/dwpose/dw-ll_ucoco_384.pth") + parser.add_argument('--align_frame', type=int, default=0) + parser.add_argument('--max_frame', type=int, default=300) + parser.add_argument('--imgfn_refer', type=str, default="./assets/images/0.jpg") + parser.add_argument('--vidfn', type=str, default="./assets/videos/0.mp4") + parser.add_argument('--outfn', type=str, default="./output_video.mp4") - if not os.path.exists("./assets/poses/align"): - # os.makedirs("./assets/poses/") - os.makedirs("./assets/poses/align") - os.makedirs("./assets/poses/align_demo") - - img_name = os.path.basename(args.imgfn_refer).split('.')[0] - video_name = os.path.basename(args.vidfn).split('.')[0] - if args.outfn_align_pose_video is None: - args.outfn_align_pose_video = "./assets/poses/align/img_{}_video_{}.mp4".format(img_name, video_name) - if args.outfn is None: - args.outfn = "./assets/poses/align_demo/img_{}_video_{}.mp4".format(img_name, video_name) - - run_align_video_with_filterPose_translate_smooth(args) - + args = parser.parse_args() + run_align_video(args) - if __name__ == '__main__': main()