#!/usr/bin/env python
# coding: utf-8

# # Stereo Reconstruction

# ## Done by: Projit and Abhinav
# (Projit generated the 3D point cloud and computed the 2D-3D correspondences, and Abhinav generated the disparity map for the stereo pairs)

# Our objective is to generate a 3D point cloud reconstruction of a scene from pairs of stereo images. The data includes a set of stereo image pairs, from which we compute the disparity and generate the point cloud from the disparity map.

# As usual, we first load the libraries we require.

# In[1]:

import cv2
import numpy as np
import matplotlib.pyplot as plt
import open3d as o3d
import os

# A function to generate the .ply file, so that the point cloud can be visualized.

# In[2]:

def create_output(vertices, colors, filename):
    # Writes an ASCII .ply file (header taken from GitHub).
    colors = colors.reshape(-1, 3)
    vertices = np.hstack([vertices.reshape(-1, 3), colors])
    ply_header = '''ply
format ascii 1.0
element vertex %(vert_num)d
property float x
property float y
property float z
property uchar red
property uchar green
property uchar blue
end_header
'''
    with open(filename, 'w') as f:
        f.write(ply_header % dict(vert_num=len(vertices)))
        np.savetxt(f, vertices, '%f %f %f %d %d %d')

# ## Loading the given data

# - We have been given stereo pairs, the K matrix, the baseline and the ground-truth poses, which we load into our program (the pose matrices are hardcoded below; a sketch of reading them from poses.txt is also given).
#
# - We construct our 3D point cloud from only one stereo pair, although we can use more by altering the `start` and `end` variables. Adjust `start` and `end` to change how many pictures are used and which pictures are selected. Ensure `start < end`.

# In[7]:

lst = os.listdir('./data/img2')
start = 0
end = 1
imgs = sorted(lst)[start:end]
print(imgs)
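# - As an alternative to hardcoding, a minimal sketch for reading the poses directly from poses.txt is given here. It assumes a KITTI-style file in which each line holds the 12 entries of a 3x4 pose matrix in row-major order; the exact path and layout are assumptions, and `load_poses` is a hypothetical helper not used by the rest of the pipeline.

# In[ ]:

def load_poses(path='./data/poses.txt'):
    '''Read ground-truth poses, assuming one 3x4 matrix (12 floats) per line.'''
    raw = np.loadtxt(path)        # shape (N, 12), assuming 12 values per row
    return raw.reshape(-1, 3, 4)  # -> (N, 3, 4) array of [R|t] matrices

# Usage (assuming the file exists at the given path):
# pose = load_poses()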
# In[8]:

pose = np.array([
    np.array([[-9.098548e-01, 5.445376e-02, -4.113381e-01, -1.872835e+02], [4.117828e-02, 9.983072e-01, 4.107410e-02, 1.870218e+00], [4.128785e-01, 2.043327e-02, -9.105569e-01, 5.417085e+01]]),
    np.array([[-9.283201e-01, 4.509830e-02, -3.690366e-01, -1.874227e+02], [3.357747e-02, 9.987291e-01, 3.758534e-02, 1.883289e+00], [3.702626e-01, 2.249990e-02, -9.286546e-01, 5.369416e+01]]),
    np.array([[-9.452512e-01, 3.706129e-02, -3.242324e-01, -1.875256e+02], [2.776133e-02, 9.990610e-01, 3.326342e-02, 1.887417e+00], [3.251607e-01, 2.244117e-02, -9.453925e-01, 5.321309e+01]]),
    np.array([[-9.602059e-01, 3.410654e-02, -2.772028e-01, -1.876307e+02], [2.674480e-02, 9.991831e-01, 3.029615e-02, 1.893359e+00], [2.780097e-01, 2.167680e-02, -9.603337e-01, 5.273710e+01]]),
    np.array([[-9.729422e-01, 3.276894e-02, -2.287129e-01, -1.876915e+02], [2.667847e-02, 9.992036e-01, 2.967147e-02, 1.900316e+00], [2.295031e-01, 2.276691e-02, -9.730416e-01, 5.225276e+01]]),
    np.array([[-9.831529e-01, 3.240420e-02, -1.798899e-01, -1.877485e+02], [2.738709e-02, 9.991654e-01, 3.030454e-02, 1.901338e+00], [1.807218e-01, 2.486734e-02, -9.832198e-01, 5.177232e+01]]),
    np.array([[-9.911510e-01, 2.523874e-02, -1.303180e-01, -1.877723e+02], [2.111370e-02, 9.992343e-01, 3.293915e-02, 1.900778e+00], [1.310496e-01, 2.989617e-02, -9.909249e-01, 5.129098e+01]]),
    np.array([[-9.966637e-01, 1.446740e-02, -8.032513e-02, -1.877414e+02], [1.146235e-02, 9.992216e-01, 3.774707e-02, 1.901606e+00], [8.080871e-02, 3.670042e-02, -9.960537e-01, 5.080341e+01]]),
    np.array([[-9.995040e-01, 1.006074e-02, -2.984320e-02, -1.877165e+02], [8.815319e-03, 9.990965e-01, 4.157437e-02, 1.903912e+00], [3.023451e-02, 4.129067e-02, -9.986896e-01, 5.033042e+01]]),
    np.array([[-9.996705e-01, 1.307490e-02, 2.209183e-02, -1.876828e+02], [1.395603e-02, 9.990937e-01, 4.021250e-02, 1.909578e+00], [-2.154603e-02, 4.050756e-02, -9.989469e-01, 4.986716e+01]]),
    np.array([[-9.970705e-01, 2.051987e-02, 7.368440e-02, -1.876045e+02], [2.335961e-02, 9.990090e-01, 3.788635e-02, 1.913879e+00], [-7.283396e-02, 3.949659e-02, -9.965617e-01, 4.940375e+01]]),
    np.array([[-9.919546e-01, 2.316199e-02, 1.244574e-01, -1.875198e+02], [2.767742e-02, 9.990153e-01, 3.467489e-02, 1.916071e+00], [-1.235318e-01, 3.784057e-02, -9.916189e-01, 4.895472e+01]]),
    np.array([[-9.843986e-01, 2.362611e-02, 1.743596e-01, -1.873856e+02], [2.975169e-02, 9.990255e-01, 3.260177e-02, 1.916040e+00], [-1.734194e-01, 3.728062e-02, -9.841422e-01, 4.850872e+01]]),
    np.array([[-9.745046e-01, 2.207656e-02, 2.232787e-01, -1.872454e+02], [2.939030e-02, 9.991330e-01, 2.948581e-02, 1.919502e+00], [-2.224342e-01, 3.529628e-02, -9.743086e-01, 4.808322e+01]]),
    np.array([[-9.620042e-01, 2.331542e-02, 2.720374e-01, -1.870919e+02], [3.162970e-02, 9.991557e-01, 2.621754e-02, 1.921924e+00], [-2.711965e-01, 3.382584e-02, -9.619295e-01, 4.766758e+01]]),
    np.array([[-9.466844e-01, 2.747540e-02, 3.209888e-01, -1.869050e+02], [3.735473e-02, 9.989978e-01, 2.465901e-02, 1.925132e+00], [-3.199896e-01, 3.533474e-02, -9.467619e-01, 4.724980e+01]]),
    np.array([[-9.303521e-01, 3.021656e-02, 3.654203e-01, -1.867142e+02], [4.116827e-02, 9.989052e-01, 2.221412e-02, 1.922721e+00], [-3.643491e-01, 3.571067e-02, -9.305775e-01, 4.684089e+01]]),
    np.array([[-9.132370e-01, 2.475193e-02, 4.066762e-01, -1.864782e+02], [3.739334e-02, 9.990320e-01, 2.316585e-02, 1.917552e+00], [-4.057092e-01, 3.636289e-02, -9.132786e-01, 4.643056e+01]]),
    np.array([[-8.953418e-01, 1.994046e-02, 4.449332e-01, -1.862433e+02], [3.564541e-02, 9.990008e-01, 2.695748e-02, 1.910636e+00], [-4.439511e-01, 3.999598e-02, -8.951580e-01, 4.602498e+01]]),
    np.array([[-8.776813e-01, 1.983338e-02, 4.788341e-01, -1.859999e+02], [3.901401e-02, 9.987839e-01, 3.014110e-02, 1.906858e+00], [-4.776541e-01, 4.513551e-02, -8.773878e-01, 4.562057e+01]]),
    np.array([[-8.608718e-01, 2.164481e-02, 5.083614e-01, -1.857437e+02], [4.407236e-02, 9.985119e-01, 3.211896e-02, 1.904895e+00], [-5.069097e-01, 5.005498e-02, -8.605447e-01, 4.521752e+01]])
])
len(pose)

# - We write a function to transform 3D world points by a pose.

# In[9]:

def make_new_pt(pt_3d, C):
    '''
    Transform a 3D world point by the pose C (uses the top 3x4 block of C).
    '''
    P = np.asarray(C)[:3, :]
    pt = np.array([[pt_3d[0], pt_3d[1], pt_3d[2], 1]]).T
    return np.matmul(P, pt)

# - An attempt to check whether the poses should be composed (stacked) or applied directly.
# - In its current form, the code just applies each pose directly.
# - The 4x4 form is useful for taking inverses if required.

# In[10]:

combination = []
for c in range(len(pose)):
    A = np.vstack([pose[c], np.array([0, 0, 0, 1])])
    combination.append(A)

final_output = []
final_color = []
output_colors = []
output_points = []
Height = 0
Width = 0
disparity_map = 0
Pixel_Point_Map = {}
Pixel_Point_Map_alt = {}
Pixel_Point_Map_2 = {}
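# - `make_new_pt` above transforms one point at a time, which is slow inside the per-pixel loops that follow. A minimal vectorized sketch of the same pose transform is given here for reference; `transform_points` is a hypothetical helper, not used by the pipeline below.

# In[ ]:

def transform_points(pts, C):
    '''Transform an (N, 3) array of points by a 4x4 pose matrix C in one shot.'''
    pts = np.asarray(pts, dtype=np.float64)
    homo = np.hstack([pts, np.ones((len(pts), 1))])  # (N, 4) homogeneous points
    return (np.asarray(C)[:3, :] @ homo.T).T         # (N, 3) transformed points

# Usage: transform_points(output_points, combination[0]) should match stacking
# make_new_pt(pt, combination[0]) over each row of output_points.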
# ## The main loop

# ### Computing the disparity map
# - We use cv2.StereoSGBM_create to generate the disparity map for each stereo pair.
# - A lot of hyperparameter tuning was done to achieve better results.
# - The disparity map is divided by 16.0 because StereoSGBM returns fixed-point disparities scaled by 16, which must be accounted for.

# In[11]:

def main():
    global final_output, final_color, output_colors, output_points, disparity_map, Height, Width
    focal_length = 7.070912e+02
    k = np.float32([[7.070912e+02, 0.000000e+00, 6.018873e+02],
                    [0.000000e+00, 7.070912e+02, 1.831104e+02],
                    [0.000000e+00, 0.000000e+00, 1.000000e+00]])
    baseline = 0.53790448812

    for count, img in enumerate(imgs):
        print('./data/img2/' + img, './data/img3/' + img)
        iml = cv2.imread('./data/img2/' + img)
        imr = cv2.imread('./data/img3/' + img)

        window_size = 3
        min_disp = 16
        num_disp = 112 - min_disp
        stereo = cv2.StereoSGBM_create(minDisparity=min_disp,
                                       numDisparities=num_disp + 2 * 16,  # must be divisible by 16
                                       blockSize=7,
                                       P1=8 * 3 * window_size ** 2,
                                       P2=32 * 3 * window_size ** 2,
                                       disp12MaxDiff=1,
                                       uniquenessRatio=12,
                                       speckleWindowSize=400,
                                       speckleRange=5)
        # StereoSGBM returns 16.4 fixed-point disparities; divide by 16 once here.
        disparity_map = stereo.compute(iml, imr).astype(np.float32) / 16.0

        h, w = iml.shape[:2]
        Height = h
        Width = w

        # Formation of the Q matrix to reproject back into 3D
        Q = np.float32([[1, 0, 0, -0.5 * w],
                        [0, -1, 0, 0.5 * h],
                        [0, 0, 0, -focal_length],
                        [0, 0, -1 / baseline, 0]])

        # cv2 way of finding 3D points, overwritten by the next part.
        points = cv2.reprojectImageTo3D(disparity_map, Q)
        colors = cv2.cvtColor(iml, cv2.COLOR_BGR2RGB)

        # Hand-coded reprojectImageTo3D... results weren't as good as the cv2 function.
        # Overwrites colors and points with our own implementation.
        Pixel_Point_Map[count + start] = []
        Pixel_Point_Map_alt[count + start] = []
        Pixel_Point_Map_2[count + start] = []
        for i in range(len(iml)):
            for j in range(len(iml[i])):
                colors[i][j] = iml[i][j]
                # Following the equation Q @ [x y disp 1].T = [X Y Z W].T to recover 3D points.
                # Q expects x = column and y = row, hence [j, i, ...]; disparity_map is
                # already divided by 16 above, so it is used as-is here.
                pt_temp = Q @ np.array([j, i, disparity_map[i][j], 1]).T
                W = pt_temp[3]
                to_ap = [pt_temp[0] / W, pt_temp[1] / W, pt_temp[2] / W]
                points[i][j] = to_ap
                Pixel_Point_Map[count + start].append(([i, j], to_ap))
                # Duplicates the masking below, but useful for collecting pixel-to-world
                # coordinate correspondences for the second part.
                if disparity_map[i][j] > disparity_map.min():
                    Pixel_Point_Map_alt[count + start].append(([i, j], to_ap))
                    # Take the reprojected 3D point and transform it by the ground-truth pose.
                    pt_new_temp = np.array(make_new_pt(to_ap, combination[count + start]).T).astype('float32')[0]
                    pt_add = [pt_new_temp[0], pt_new_temp[1], pt_new_temp[2]]
                    Pixel_Point_Map_2[count + start].append(([i, j], pt_add))
        print(points.shape)
        print(colors.shape)

        # Duplicated masking from above
        mask = disparity_map > disparity_map.min()
        output_points = points[mask]
        output_colors = colors[mask]
        print(output_points.shape)
        for i, pt in enumerate(output_points):
            # Transform the 3D points by the ground-truth poses.
            final_output.append(np.array(make_new_pt(pt, combination[count + start]).T).astype('float32')[0])
            final_color.append(output_colors[i])
        print("Length of output points:", len(output_points), "count:", count)

    final_output = np.array(final_output)
    final_color = np.array(final_color)
    print(final_output.shape, final_color.shape)
    # Construct the .ply file to be viewed in MeshLab or o3d.
    create_output(final_output, final_color, 'jnrecon_mult.ply')

main()
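# - The generated .ply can be inspected directly from Python with open3d (already imported above). A minimal sketch, assuming jnrecon_mult.ply was written by the run above; it opens an interactive viewer window.

# In[ ]:

def view_cloud(path='jnrecon_mult.ply'):
    '''Load the reconstructed point cloud and open an interactive viewer.'''
    pcd = o3d.io.read_point_cloud(path)
    print(pcd)  # prints the number of points loaded
    o3d.visualization.draw_geometries([pcd])

# Usage:
# view_cloud()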
# In[26]:

# Original:
# Pixel_Point_Map
# Alternative, filtering out wrong disparity values:
# Pixel_Point_Map_alt
# Last thing to try, filtering out wrong disparities and transforming by the ground-truth matrix:
# Pixel_Point_Map_2

# Each is a dictionary whose keys correspond to which image pair we are taking;
# each value is a list of tuples.
# The first element of each tuple is a list [x, y] of the pixel coordinates.
# The second element is a list [X, Y, Z] of the 3D world coordinates.
# Form of the dictionary:
# {0: [([x_p1, y_p1], [X_w1, Y_w1, Z_w1]), ([x_p2, y_p2], [X_w2, Y_w2, Z_w2]), ...],
#  1: [...], ...}

# In[27]:

# No need to use the code below
# p3p part
# d = (f - Z' * b) / (Z' * a)
# Ix = X' * (d * a + b) + Cx
# Iy = Y' * (d * a + b) + Cy

# Height = 370
# Width = 1226

# def find_impts(X, Y, Z):
#     f = 7.070912e+02
#     baseline = 0.53790448812
#     a = -1/baseline
#     b = 0
#     d = (f - Z * b) / (Z * a)
#     Ix = X * (d*a + b) + 0.5*Width
#     Iy = Y * (d*a + b) + 0.5*Height
#     return (int(Iy), int(Ix))

# inv_c = []
# hold_pts = []
# for c in combination:
#     inv_c.append(np.linalg.inv(c))

# for i, pt in enumerate(final_output):
#     pose_num = int(i/304696)
#     temp_pt = np.array([pt[0], pt[1], pt[2], 1]).T
#     out_pt = np.matmul(inv_c[pose_num], temp_pt)
#     hold_pts.append(find_impts(out_pt[0], out_pt[1], out_pt[2]))

# # Non-linear optimization

# ## Done by Abhinav and Projit
# (Abhinav worked on the code for Gauss-Newton and obtained the 2D correspondences, and both Projit and Abhinav derived the Jacobian together)

# ## Getting the projection matrix
# - We already have the 3D points generated in the previous part, which we used to render the point cloud. We now find the 2D pixel points that correspond to these 3D world points.
# - We do so by multiplying each 3D point by the projection matrix, obtained from K, R and t.
# - K has been given to us, and so has R|t in poses.txt.
# - We take the inverse of the R|t matrix because we want to go from world coordinates to camera coordinates. Hence we add a new row to make it 4x4, so it can be inverted.
# - P = K[R|t]

# In[8]:

# Correct R|t matrix (approximately pose[0])
Q = np.array([
    [-9.1e-01, 5.5e-02, -4.2e-01, -1.9e+02],
    [4.2e-02, 9.983072e-01, 4.2e-02, 1.7e+00],
    [4.2e-01, 2.1e-02, -9.2e-01, 5.5e+01]
])
row = np.array([0, 0, 0, 1])
Q = np.vstack((Q, row))
Q = np.linalg.inv(Q)
Q = Q[:3, :]

# Calibration matrix as given
cal_mat = np.array([[7.070912e+02, 0.000000e+00, 6.018873e+02],
                    [0.000000e+00, 7.070912e+02, 1.831104e+02],
                    [0.000000e+00, 0.000000e+00, 1.000000e+00]])

# Correct projection matrix, the one we should be getting after GN
P = np.matmul(cal_mat, Q)  # 3x4 matrix
print("Correct projection matrix: \n", P)

# ## Generating 2D-3D correspondences
# - Now we can find the actual 2D point that corresponds to each 3D point by multiplying the 3D point with P.

# In[9]:

# Getting the actual 2D points from the 3D points
pts_3d = []          # 3D points from the point cloud
actual_pts_2d = []   # actual 2D pixel values
for i in range(0, 10000, 100):
    # Sampling 100 evenly spaced points (every 100th of the first 10000) and storing them in arrays
    pt_3d = Pixel_Point_Map_alt[0][i][1]
    pt_3d = np.append(pt_3d, 1)  # homogeneous 4-vector
    pts_3d.append(pt_3d)
    actual_pt_2d = np.matmul(P, pt_3d)  # 3x4 times 4x1 gives a 3x1 homogeneous point
    actual_pt_2d = actual_pt_2d / actual_pt_2d[2]  # last element becomes 1
    actual_pts_2d.append(actual_pt_2d)
pts_3d = np.array(pts_3d)                # 100x4
actual_pts_2d = np.array(actual_pts_2d)  # 100x3

# ## Gauss-Newton
# - Now we run the Gauss-Newton algorithm and try to refine P with every iteration.
# - We take an initial estimate of P that is close to the actual value of P, because otherwise GN would get stuck in a local minimum.

# In[10]:

P_est = np.array([
    [-8.8e+02, 5.4e+01, -2.4e+02, -1.5e+05],
    [-3.6e+01, 7.1e+02, -1.4e+02, 1.4e+02],
    [-4.1e-01, 4.1e-02, -8.9e-01, -2.9e+01]
])
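# - The full derivation of the Jacobian is in the handwritten notes attached with the submission; for quick reference, the terms assembled in the loop below are summarized here. Projecting a homogeneous 3D point $\mathbf{X}$ by the estimate $P$ gives $u = \frac{\mathbf{p}_1^T \mathbf{X}}{\mathbf{p}_3^T \mathbf{X}}$ and $v = \frac{\mathbf{p}_2^T \mathbf{X}}{\mathbf{p}_3^T \mathbf{X}}$, where $\mathbf{p}_i^T$ is the $i$-th row of $P$. Writing $Z = \mathbf{p}_3^T \mathbf{X}$, the derivatives with respect to the 12 entries of $P$ are
#
# $$\frac{\partial u}{\partial \mathbf{p}_1} = \frac{\mathbf{X}^T}{Z}, \quad \frac{\partial u}{\partial \mathbf{p}_2} = \mathbf{0}, \quad \frac{\partial u}{\partial \mathbf{p}_3} = -\frac{u\,\mathbf{X}^T}{Z}$$
#
# $$\frac{\partial v}{\partial \mathbf{p}_1} = \mathbf{0}, \quad \frac{\partial v}{\partial \mathbf{p}_2} = \frac{\mathbf{X}^T}{Z}, \quad \frac{\partial v}{\partial \mathbf{p}_3} = -\frac{v\,\mathbf{X}^T}{Z}$$
#
# These are exactly the terms J1-J24 assembled in the loop below, where the code's X and Y denote the numerators $\mathbf{p}_1^T \mathbf{X}$ and $\mathbf{p}_2^T \mathbf{X}$ (so $u = X/Z$ and $v = Y/Z$).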
# - Now we run the Gauss-Newton algorithm. With every iteration, we try to find a better estimate for P.
# - Please refer to the handwritten notes attached with the submission for the Jacobian derivation (summarized above).
# - Every iteration, we update P to P + delP, where delP is calculated from the Jacobian.

# In[37]:

print("Running Gauss-Newton: \n ")
sum_sq_error = 10000
while True:  # iterate until the squared error stops changing
    prev_error = sum_sq_error
    pts_2d = []  # corresponding 2D locations estimated from our current P matrix
    J = []
    for pt_3d in pts_3d:
        pt_2d = np.matmul(P_est, pt_3d)
        pt_2d = pt_2d / pt_2d[2]
        pts_2d.append(pt_2d)
    error = actual_pts_2d - pts_2d       # 100x3
    error = np.delete(error, 2, axis=1)  # 100x2, i.e. 100 points with 2 coordinates each
    # print("error:", error)
    sum_sq_error = 0
    for obj in error:
        sq_error = np.matmul(obj.T, obj)
        sum_sq_error = sum_sq_error + sq_error
    print("squared sum error:", sum_sq_error)
    if abs(sum_sq_error - prev_error) < 0.0001:
        break

    # Calculating J: for each point, one row for u = X/Z and one row for v = Y/Z,
    # differentiated with respect to the 12 entries of P.
    for pt_3d in pts_3d:
        X = P_est[0][0]*pt_3d[0] + P_est[0][1]*pt_3d[1] + P_est[0][2]*pt_3d[2] + P_est[0][3]
        Y = P_est[1][0]*pt_3d[0] + P_est[1][1]*pt_3d[1] + P_est[1][2]*pt_3d[2] + P_est[1][3]
        Z = P_est[2][0]*pt_3d[0] + P_est[2][1]*pt_3d[1] + P_est[2][2]*pt_3d[2] + P_est[2][3]
        X1, Y1, Z1 = pt_3d[0], pt_3d[1], pt_3d[2]
        # Row for u: derivatives of X/Z
        J1, J2, J3, J4 = X1/Z, Y1/Z, Z1/Z, 1/Z
        J5 = J6 = J7 = J8 = 0
        J9 = (-X*X1)/(Z*Z)
        J10 = (-X*Y1)/(Z*Z)
        J11 = (-X*Z1)/(Z*Z)
        J12 = -X/(Z*Z)  # d(X/Z)/dP[2][3]; note the X in the numerator
        # Row for v: derivatives of Y/Z
        J13 = J14 = J15 = J16 = 0
        J17, J18, J19, J20 = J1, J2, J3, J4
        J21 = (-Y*X1)/(Z*Z)
        J22 = (-Y*Y1)/(Z*Z)
        J23 = (-Y*Z1)/(Z*Z)
        J24 = -Y/(Z*Z)  # d(Y/Z)/dP[2][3]
        J = np.append(J, [J1, J2, J3, J4, J5, J6, J7, J8, J9, J10, J11, J12,
                          J13, J14, J15, J16, J17, J18, J19, J20, J21, J22, J23, J24])
    J = J.reshape(200, 12)
    JTJ = np.matmul(J.T, J)
    JTJ = np.linalg.pinv(JTJ)  # (J^T J)^-1, 12x12
    error = np.reshape(error, (200, 1))
    e = np.matmul(J.T, error)  # 12x1
    delP = np.matmul(JTJ, e)
    delP = delP.reshape(3, 4)
    P_est = P_est + delP

print("Predicted projection matrix after Gauss-Newton: \n")
print(P_est)

# - We see that the algorithm converges after a few steps.
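# - As an optional sanity check (not part of the original pipeline), the refined matrix can be decomposed back into intrinsics and rotation with cv2.decomposeProjectionMatrix, to compare against the given K and R|t. A minimal sketch; `check_decomposition` is a hypothetical helper.

# In[ ]:

def check_decomposition(P_mat):
    '''Decompose a 3x4 projection matrix into K, R and the camera centre.'''
    K_dec, R_dec, c_dec = cv2.decomposeProjectionMatrix(P_mat)[:3]
    K_dec = K_dec / K_dec[2, 2]  # normalize so K[2][2] == 1
    print("Recovered K:\n", K_dec)
    print("Recovered R:\n", R_dec)
    print("Camera centre (homogeneous):\n", c_dec)

# Usage:
# check_decomposition(P_est)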