/** @file * @brief GPU-ICP Algorithm * @author Deyuan Qiu, University of Applied Sciences Bonn-Rhein-Sieg, Sankt Augustin, Germany. * Fraunhofer IAIS, Sankt Augustin, Germany. */ #include "slam6d/cuda/CIcpGpuCuda_kernel.cuh" #include "slam6d/cuda/CIcpGpuCuda.cuh" #include "slam6d/cuda/CSystem.h" void CIcpGpuCuda::init(unsigned unWidth, unsigned unHeight, unsigned max_iter) { // Initialize CUTIL int d; cudaSetDevice(0); // Since we have only one GPU I didn't initialize anything // It may cause problems if more GPUs take into account // The reason of doing that was initialization of it several times // Now it is not initialized several times and just once. unMaxIteration = max_iter; matrices = (Matrix**)malloc(sizeof(Matrix*) * max_iter); for(int i = 0 ; i < max_iter ; ++i){ matrices[i] = new Matrix(4,4); Matrix* m = matrices[i]; (*m)(1,1) = 1;(*m)(1,2) = 0;(*m)(1,3) = 0;(*m)(1,4) = 0; (*m)(2,1) = 0;(*m)(2,2) = 1;(*m)(2,3) = 0;(*m)(2,4) = 0; (*m)(3,1) = 0;(*m)(3,2) = 0;(*m)(3,3) = 1;(*m)(3,4) = 0; (*m)(4,1) = 0;(*m)(4,2) = 0;(*m)(4,3) = 0;(*m)(4,4) = 1; } // set data size setResolution(unWidth, unHeight); // cout<<"unSizeData: "<::allocate(unSizeData, 3, h_idata); //model // Device memory allocation CUDA_SAFE_CALL(cudaMalloc((void**)&fDist, unSizeData*sizeof(float))); CUDA_SAFE_CALL(cudaMalloc((void**)&fDistCpt, unSizeData*sizeof(float))); CUDA_SAFE_CALL(cudaMalloc((void**)&unMask, unSizeData*sizeof(unsigned))); CUDA_SAFE_CALL(cudaMalloc((void**)&fDevMdlPairX, unSizeData*sizeof(float))); //pairs after shrinking CUDA_SAFE_CALL(cudaMalloc((void**)&fDevMdlPairY, unSizeData*sizeof(float))); CUDA_SAFE_CALL(cudaMalloc((void**)&fDevMdlPairZ, unSizeData*sizeof(float))); CUDA_SAFE_CALL(cudaMalloc((void**)&fDevScnPairX, unSizeData*sizeof(float))); CUDA_SAFE_CALL(cudaMalloc((void**)&fDevScnPairY, unSizeData*sizeof(float))); CUDA_SAFE_CALL(cudaMalloc((void**)&fDevScnPairZ, unSizeData*sizeof(float))); /////////////// Added by Shams CUDA_SAFE_CALL(cudaMalloc((void**)&cngfDevScnX,unSizeData*sizeof(float))); CUDA_SAFE_CALL(cudaMalloc((void**)&cngfDevScnY,unSizeData*sizeof(float))); CUDA_SAFE_CALL(cudaMalloc((void**)&cngfDevScnZ,unSizeData*sizeof(float))); CUDA_SAFE_CALL(cudaMalloc((void**)&cngfDevMdlPairX, unSizeData*sizeof(float))); //pairs after shrinking CUDA_SAFE_CALL(cudaMalloc((void**)&cngfDevMdlPairY, unSizeData*sizeof(float))); CUDA_SAFE_CALL(cudaMalloc((void**)&cngfDevMdlPairZ, unSizeData*sizeof(float))); CUDA_SAFE_CALL(cudaMalloc((void**)&cngfDevScnPairX, unSizeData*sizeof(float))); CUDA_SAFE_CALL(cudaMalloc((void**)&cngfDevScnPairY, unSizeData*sizeof(float))); CUDA_SAFE_CALL(cudaMalloc((void**)&cngfDevScnPairZ, unSizeData*sizeof(float))); /////////////// CUDA_SAFE_CALL(cudaMalloc((void**)&fCenModX, unSizeData*sizeof(float))); CUDA_SAFE_CALL(cudaMalloc((void**)&fCenModY, unSizeData*sizeof(float))); CUDA_SAFE_CALL(cudaMalloc((void**)&fCenModZ, unSizeData*sizeof(float))); CUDA_SAFE_CALL(cudaMalloc((void**)&fCenScnX, unSizeData*sizeof(float))); CUDA_SAFE_CALL(cudaMalloc((void**)&fCenScnY, unSizeData*sizeof(float))); CUDA_SAFE_CALL(cudaMalloc((void**)&fCenScnZ, unSizeData*sizeof(float))); CUDA_SAFE_CALL(cudaMalloc((void**)&unNoPairs, sizeof(unsigned))); CUDA_SAFE_CALL(cudaMalloc((void**)&fDevScnX, unSizeData*sizeof(float))); CUDA_SAFE_CALL(cudaMalloc((void**)&fDevScnY, unSizeData*sizeof(float))); CUDA_SAFE_CALL(cudaMalloc((void**)&fDevScnZ, unSizeData*sizeof(float))); CUDA_SAFE_CALL(cudaMalloc((void**)&fDevSplit, _unSizeTree*sizeof(float))); CUDA_SAFE_CALL(cudaMalloc((void**)&unDevIdx, _unSizeTree*sizeof(unsigned))); CUDA_SAFE_CALL(cudaMalloc((void**)&unDevAxis, _unSizeTree*sizeof(unsigned))); CUDA_SAFE_CALL(cudaMalloc((void**)&bDevIsLeaf, _unSizeTree*sizeof(bool))); CUDA_SAFE_CALL(cudaMalloc((void**)&fDevLoBound, _unSizeTree*sizeof(float))); CUDA_SAFE_CALL(cudaMalloc((void**)&fDevHiBound, _unSizeTree*sizeof(float))); CUDA_SAFE_CALL(cudaMallocArray(&cuArray, &cuDesc, _unWidth, _unHeight)); //to be bound to texture // Initialize states fMaxProcTime = 0.0f; fMaxDeviation = 0.0f; _fSearchRadiusMax = 0.0f; _fSearchRadiusMin = 0.0f; _fRadiusStep = 0.0f; _unNoQSizeStep = 0; _dElapsedTime = 0.0; /* * Array of ones to be used instead of abs sum */ cudaMallocHost((void**)&temp_ones, unSizeData*sizeof(float)); cudaMalloc((void**)&ones, unSizeData*sizeof(float)); // Array of ones for(int i = 0; i < unSizeData ; ++i)temp_ones[i] = 1.0f; cudaMemcpy(ones, temp_ones, unSizeData*sizeof(float), cudaMemcpyHostToDevice); } CIcpGpuCuda::~CIcpGpuCuda(){ ///////////// // tidy up ///////////// CUDA_SAFE_CALL(cudaUnbindTexture(refTex)); CUDA_SAFE_CALL(cudaFreeArray(cuArray)); CUDA_SAFE_CALL(cudaFree(fDevSplit)); CUDA_SAFE_CALL(cudaFree(unDevIdx)); CUDA_SAFE_CALL(cudaFree(unDevAxis)); CUDA_SAFE_CALL(cudaFree(bDevIsLeaf)); CUDA_SAFE_CALL(cudaFree(fDevLoBound)); CUDA_SAFE_CALL(cudaFree(fDevHiBound)); CUDA_SAFE_CALL(cudaFree(fDevScnX)); CUDA_SAFE_CALL(cudaFree(fDevScnY)); CUDA_SAFE_CALL(cudaFree(fDevScnZ)); CUDA_SAFE_CALL(cudaFree(fDist)); CUDA_SAFE_CALL(cudaFree(fDistCpt)); CUDA_SAFE_CALL(cudaFree(fCenModX)); CUDA_SAFE_CALL(cudaFree(fCenModY)); CUDA_SAFE_CALL(cudaFree(fCenModZ)); CUDA_SAFE_CALL(cudaFree(fCenScnX)); CUDA_SAFE_CALL(cudaFree(fCenScnY)); CUDA_SAFE_CALL(cudaFree(fCenScnZ)); CUDA_SAFE_CALL(cudaFree(unMask)); CUDA_SAFE_CALL(cudaFree(fDevMdlPairX)); CUDA_SAFE_CALL(cudaFree(fDevMdlPairY)); CUDA_SAFE_CALL(cudaFree(fDevMdlPairZ)); CUDA_SAFE_CALL(cudaFree(fDevScnPairX)); CUDA_SAFE_CALL(cudaFree(fDevScnPairY)); CUDA_SAFE_CALL(cudaFree(fDevScnPairZ)); CUDA_SAFE_CALL(cudaFree(cngfDevScnX)); CUDA_SAFE_CALL(cudaFree(cngfDevScnY)); CUDA_SAFE_CALL(cudaFree(cngfDevScnZ)); CUDA_SAFE_CALL(cudaFree(cngfDevMdlPairX)); CUDA_SAFE_CALL(cudaFree(cngfDevMdlPairY)); CUDA_SAFE_CALL(cudaFree(cngfDevMdlPairZ)); CUDA_SAFE_CALL(cudaFree(cngfDevScnPairX)); CUDA_SAFE_CALL(cudaFree(cngfDevScnPairY)); CUDA_SAFE_CALL(cudaFree(cngfDevScnPairZ)); CUDA_SAFE_CALL(cudaFree(unNoPairs)); CUDA_SAFE_CALL(cudaFree(ones)); CUDA_SAFE_CALL(cudaFreeHost(fSplit)); CUDA_SAFE_CALL(cudaFreeHost(unIdx)); CUDA_SAFE_CALL(cudaFreeHost(unAxis)); CUDA_SAFE_CALL(cudaFreeHost(bIsLeaf)); CUDA_SAFE_CALL(cudaFreeHost(fHstScnX)); CUDA_SAFE_CALL(cudaFreeHost(fHstScnY)); CUDA_SAFE_CALL(cudaFreeHost(fHstScnZ)); CUDA_SAFE_CALL(cudaFreeHost(fLoBound)); CUDA_SAFE_CALL(cudaFreeHost(fHiBound)); CUDA_SAFE_CALL(cudaFreeHost(pNoPairs)); CUDA_SAFE_CALL(cudaFreeHost(f4Mdl)); CUDA_SAFE_CALL(cudaFreeHost(temp_ones)); free(h_idata); ///////// // Exit ///////// bool bShutDownSuccess = true; // Done with CUDPP result = cudppDestroyPlan(compactplan); if (CUDPP_SUCCESS != result){ printf("Error destroying CUDPPPlan\n"); bShutDownSuccess = false; } // Done with CUBLAS cublasStatus statusCUBLAS = cublasShutdown(); if (statusCUBLAS != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! shutdown error (A)\n"); bShutDownSuccess = false; } // Done with ANN annClose(); } void CIcpGpuCuda::setResolution(unsigned unWidth, unsigned unHeight){ if (unWidth>0 && unHeight>0) { _unWidth = unWidth; _unHeight = unHeight; unSizeData = _unWidth*_unHeight; //configure block and grid size unsigned unBlockSize = (unsigned)BLOCKSIZE; //192 if(unSizeData<=64){ unNoThreads = 64; unNoBlocks = 1; } else if(unSizeData<=128){ unNoThreads = 128; unNoBlocks = 1; } else if(unSizeData<=unBlockSize){ unNoThreads = unBlockSize; unNoBlocks = 1; } else{ unNoThreads = unBlockSize; if(unSizeData%unBlockSize) unNoBlocks = unSizeData/unBlockSize + 1; else unNoBlocks = unSizeData/unBlockSize; } cout<<"unNoThreads: "<0) unMaxIteration = unTimes; else{ cout<<"Error setting maximum iterations."<0) fMaxProcTime = dMilliseconds; else{ cout<<"Error setting maximum processing time."<0) fMaxDeviation = fDeviation; else{ cout<<"Error setting maximum deviation."<=fRadiusMin)&&(unIterations>0)){ _fSearchRadiusMax = fRadiusMax; _fSearchRadiusMin = fRadiusMin; _fRadiusStep = (_fSearchRadiusMax-_fSearchRadiusMin)/(float)unIterations; _unIterations = unIterations; _unNoQSizeStep = unIterations/(unsigned)NO_QSIZE; } else{ cout<<"Error setting search radius."<getStats(*st); int nDepth = st->depth; // cout<<"level of the tree: "<depth<<" (counted from 0)"<_unSizeTree) { cout << "Not enough memory for tree construction. Tree size must be smaller than " << unSizeTree <getRoot(); if (unSizeData>1) { rearrange(pRoot, 1); } else { cout<<"Not enough points in the tree."<isLeaf()) { bIsLeaf[unStart-1] = true; unIdx[unStart-1] = (unsigned)(((ANNkd_leaf*)root)->getIdxArray())[0]; } else { ANNkd_ptr nL = ((ANNkd_split*)root)->getLeftChild(); ANNkd_ptr nR = ((ANNkd_split*)root)->getRightChild(); bIsLeaf[unStart-1] = false; fSplit[unStart-1] = (float)((ANNkd_split*)root)->getCutVal(); unAxis[unStart-1] = (unsigned)((ANNkd_split*)root)->getCutDim(); fLoBound[unStart-1] = ((ANNkd_split*)root)->getLoBound(); fHiBound[unStart-1] = ((ANNkd_split*)root)->getHiBound(); rearrange(nL, unStart*2); rearrange(nR, unStart*2+1); } } } void CIcpGpuCuda::setPointClouds(void){ setTree(); setModel(); setScene(); } void CIcpGpuCuda::iteration(){ ////////////////////////// // prepare for iteration ////////////////////////// EnumIcpState icpStat = ICP_PROCESSING; float fSearchRadius = _fSearchRadiusMax; unsigned unNoIter = 0; float fDeviation = 0.0; unsigned unQStep = 0; final_matrix = new Matrix(4,4); (*final_matrix)(1,1) = 1.0;(*final_matrix)(1,2) = 0.0;(*final_matrix)(1,3) = 0.0;(*final_matrix)(1,4) = 0.0; (*final_matrix)(2,1) = 0.0;(*final_matrix)(2,2) = 1.0;(*final_matrix)(2,3) = 0.0;(*final_matrix)(2,4) = 0.0; (*final_matrix)(3,1) = 0.0;(*final_matrix)(3,2) = 0.0;(*final_matrix)(3,3) = 1.0;(*final_matrix)(3,4) = 0.0; (*final_matrix)(4,1) = 0.0;(*final_matrix)(4,2) = 0.0;(*final_matrix)(4,3) = 0.0;(*final_matrix)(4,4) = 1.0; Matrix matrix(4,4); init_time = clock(); // The main loop of ICP while(icpStat == ICP_PROCESSING){ if (unNoIter <= _unIterations) unQStep = unNoIter/_unNoQSizeStep; findNearestNeighbors(fSearchRadius, unQStep); result=cudppCompact(compactplan, fDistCpt, (size_t*)unNoPairs, fDist, unMask, (size_t)unSizeData); if (CUDPP_SUCCESS != result) printf("Error cudppCompact\n"); CUDA_SAFE_CALL(cudaMemcpy(pNoPairs, unNoPairs, sizeof(unsigned), cudaMemcpyDeviceToHost)); unPairs = *pNoPairs; if (unPairs) { ////////////////////// // transform estimation ////////////////////// // Compute centroids (assume all data are non-negative) float *fCm = new float(3); float *fCs = new float(3); computeCentroid(fDevMdlPairX,fDevMdlPairY,fDevMdlPairZ, fCm); computeCentroid(fDevScnPairX,fDevScnPairY,fDevScnPairZ, fCs); fDeviation = cublasSdot(unSizeData,fDistCpt,1,ones,1); fDeviation /= unPairs; getCublasErr(); // check for termination conditions unNoIter++; if(unNoIter<_unIterations) fSearchRadius-=_fRadiusStep; if(fDeviation <= fMaxDeviation) icpStat = ICP_SUCCESS; else if(unNoIter >= unMaxIteration + 1) // unNoIter starts from 1 icpStat = ICP_MAXITERATIONS; else if ( (double)(clock() - init_time)/ (double)CLOCKS_PER_SEC * 1000>= fMaxProcTime ) icpStat = ICP_TIMEELAPSED; else { //Calculate centered point pairs class_centralize(unMask, fDevMdlPairX,fDevMdlPairY,fDevMdlPairZ,fDevScnPairX,fDevScnPairY,fDevScnPairZ, fCm[0],fCm[1],fCm[2],fCs[0],fCs[1],fCs[2], fCenModX,fCenModY,fCenModZ,fCenScnX,fCenScnY,fCenScnZ); //Fill H matrix Matrix H(3,3); H = computeHMatrix(); //SVD Matrix U(3,3); DiagonalMatrix Lamda(3); Matrix V(3,3); SVD(H,Lamda,U,V); //Get rotation Matrix R(3,3); R = V*(U.t()); // Calculate translation double dTranslation[3]; ColumnVector col_vec(3); for(unsigned j = 0; j < 3; j++) col_vec(j+1) = fCs[j]; ColumnVector r_time_colVec = ColumnVector(R*col_vec); dTranslation[0] = fCm[0] - r_time_colVec(1); dTranslation[1] = fCm[1] - r_time_colVec(2); dTranslation[2] = fCm[2] - r_time_colVec(3); matrix = fillHomoMatrix(&R,dTranslation); *final_matrix = matrix * (*final_matrix); for(int i = 1 ; i < 5 ; ++i) for(int j = 1; j < 5 ; ++j){ (*matrices[unNoIter - 1])(i,j) = (matrix)(i,j); } ///////////// // transform ///////////// class_transformation(fDevScnX, fDevScnY, fDevScnZ, (float)matrix(1,1), (float)matrix(1,2), (float)matrix(1,3), (float)matrix(1,4), (float)matrix(2,1), (float)matrix(2,2), (float)matrix(2,3), (float)matrix(2,4), (float)matrix(3,1), (float)matrix(3,2), (float)matrix(3,3), (float)matrix(3,4)); /////////////// // termination /////////////// } }//if(unPairs) else icpStat = ICP_NOTMATCHABLE; }//while(icpStat == ICP_PROCESSING) _dElapsedTime = (double)(clock() - init_time)/(double)CLOCKS_PER_SEC * 1000.0; //temporary cout<<"=========="<