/** @file * @brief GPU-ICP Algorithm * @author Deyuan Qiu, University of Applied Sciences Bonn-Rhein-Sieg, Sankt Augustin, Germany. * Fraunhofer IAIS, Sankt Augustin, Germany. */ #ifndef CICPGPUCUDA_H #define CICPGPUCUDA_H #include // C standard library #include // C I/O (for sscanf) #include // string manipulation #include // file I/O #include // min() #include #include "ANN/ANN.h" // ANN declarations #include "ANN/ANNperf.h" // kd-tree printing #include "kd_tree.h" // ANN node declaration #include "newmat/newmat.h" #include "newmat/newmatap.h" using namespace NEWMAT; #include "slam6d/cuda/CSystem.h" /* * The maximum block size. For nVidia G80 architecture, 192 is suggested. */ //#define BLOCKSIZE 512 //#define BLOCKSIZE 256 #define BLOCKSIZE 64 //#define BLOCKSIZE 1 /* * The big enough size of the AoS that is going to allocate for the kd-tree. */ //#define TREESIZE 262143 #define TREESIZE 524288 #ifdef use_namespace using namespace std; #endif /* * @class CIcpGpuCuda * @brief Iterative Closest Point algorithm is implemented on a programmable graphic * device. Kernels are implemented by CUDA (Compute Unified Device Architecture) GPGPU * programming language (http://www.nvidia.com/object/cuda_home.html). To compile the code, * nvcc compiler and related CUDA libraries must be installed. Kernel files are wrapped in * CIcpGpuCuda_kernel.cuh and CIcpGpuCuda.cu. Attention: only NVidia GeForce G80 architecture * and onwards graphic devices are garanteed to be supported. * @author Deyuan Qiu * @date 2008.Nov. */ class CIcpGpuCuda{ public: /* * standard constructor * @param argc passed from application main function. * @param argv passed from application main function. * @param unWidth the width of point cloud image. * @param unHeight the height of pint cloud image. */ CIcpGpuCuda(unsigned unWidth, unsigned unHeight, unsigned max_iter){ init(unWidth, unHeight,max_iter); } /* * standard destructor */ ~CIcpGpuCuda(); /* * Set maximum iteration for ICP. * @param unTimes number of times ICP iterates maximally. */ void setMaxIteration(unsigned unTimes); /* * Set maximum processing time for ICP, in milliseconds. * @param dMilliseconds ICP quits when elapsed time exceeds. */ void setMaxProcTime(double dMilliseconds); /* * Set maximum deviation for ICP. * @param dDeviation ICP quits when specified deviation is achieved. */ void setMaxDeviation(double dDeviation); /* * Set search radiuses for ICP. An iterative decreasing radius * is applied. Search radius decreases linearly from fRadiusMax to * fRadiusMin within unIterations iterations. * @param fRadiusMax Initial radius. * @param fRadiusMin Final radius. * @param unIterations Number of iterations, in which radius decreases. */ void setSearchRadius(float fRadiusMax, float fRadiusMin, unsigned unIterations); /* * Get the number of points in point cloud. Model point cloud * and scene point cloud must have the same number of points. * @return Number of points in point cloud. */ unsigned getSize(void); /* * Get the the 2 dimensional pointer to the scene point cloud. Page-lock memory is * allocated and freed by the class. The data type must be casted to single pricision * float, and the array should be loaded as [3][N]. * @return 2 dimensional pointer to scene point cloud. */ float** getScenePointer(void); /* * Get the the 2 dimensional pointer to the model point cloud. Memory allocation * and freeing is handled by the class. The data type must be casted to double pricision * float, and the array should be loaded as [N][3]. Notice the difference to * getScenePointer(). * @return 2 dimensional pointer to scene point cloud. */ double** getModelPointer(void); /* * The method is called after point clouds are loaded, and before iteration() is called. */ void setPointClouds(void); /* * ICP iterations. */ void iteration(void); /* * Get the transformation matrix. * @return The transformation matrix. */ Matrix* getMatrix(void); /** * These two functions are to set and get the tree pointer * It is supposed to be created in scan file and passed to this class **/ void setTreePointer(ANNkd_tree *); void getTreePointer(ANNkd_tree *&); double getTime(void); void setMinimums(float x, float y, float z); Matrix** getMatrices(); void setTrans_Trans_inv(const double[], const double[]); void findNearestNeighbors(float, unsigned); Matrix fillHomoMatrix(Matrix* , double*); void computeCentroid(float*, float*, float*, float *&); Matrix computeHMatrix(); void printMatrix(Matrix *); void setSize(unsigned int width, unsigned int height); private: //////////////// //cpu variables //////////////// /* * Initialization. Memories are allocated and default environmental state * is set. Called by standard constructor. * @param unWidth the width of point cloud image. * @param unHeight the height of pint cloud image. */ void init(unsigned unWidth, unsigned unHeight, unsigned max_iter); /* * Initialization. Memories are allocated and default environmental state * is set. Called by init(). * @param unWidth the width of point cloud image. * @param unHeight the height of pint cloud image. */ void setResolution(unsigned unWidth, unsigned unHeight); /* * The internal search structure is set. */ void setTree(void); /* * Set the model point cloud. */ void setModel(void); /* * Set the scene point cloud. */ void setScene(void); /* * Calculate the size of the search structure from its depth. The depth * is the number of levels of the tree. The search structure then is arraged * into a struture of arrays (SoA). A left-balanced binary tree is suggested. * @param nDepth Depth of the kd-tree. */ inline int depth2size(int nDepth); /* * Rearrange the search structure into a structure of arrays (SoA). The rule * is: the N node's left child has the index of 2N, while the right child 2N+1. * Arrangement is fulfilled in recursion. * @param root pointer to the root node * @param unStart the currenet node */ void rearrange(ANNkd_ptr root, unsigned unStart); /* * Get CUBLAS errors before this line. */ void getCublasErr(void); /* * Get CUDA errors before this line. */ void getCudaErr(void); /* * tree: structure of arrays (SoA) */ unsigned _unSizeTree; //Size of the allocated memory. float* fSplit; unsigned* unIdx; unsigned* unAxis; bool* bIsLeaf; float* fLoBound; float* fHiBound; unsigned unSizeTree; //Size of memory that the tree actually takes. /* * transformation matrix */ float m[16]; Matrix* trans; Matrix* trans_inv; /* * kernel constants */ unsigned unSizeData; unsigned unNoThreads; unsigned unNoBlocks; unsigned _unWidth; unsigned _unHeight; /* * search structure */ ANNkd_tree* kdTree; ANNkdStats* st; /* * data pointers */ float* fHstScn[3]; float* fHstScnX; float* fHstScnY; float* fHstScnZ; double** h_idata; /* * icp */ unsigned unMaxIteration; unsigned _unIterations; float fMaxProcTime; float fMaxDeviation; float _fSearchRadiusMax; float _fSearchRadiusMin; float _fRadiusStep; unsigned _unNoQSizeStep; float* temp_ones; float* ones; enum EnumIcpState { ICP_LIMIT = 0, ICP_PROCESSING = 1, ICP_NOTMATCHABLE = 2, ICP_MAXITERATIONS = 3, ICP_TIMEELAPSED = 4, ICP_SUCCESS = 5 }; unsigned unPairs; unsigned* pNoPairs; Matrix* final_matrix; double _dElapsedTime; Matrix** matrices; /* Minimums of all values */ float min_x; float min_y; float min_z; clock_t init_time; // To save the starting point of the timer Added by Shams //////////////// //gpu variables //////////////// float* fDevSplit; unsigned* unDevIdx; unsigned* unDevAxis; bool* bDevIsLeaf; float* fDevLoBound; float* fDevHiBound; float* fDevScnX; float* fDevScnY; float* fDevScnZ; float* fDist; float* fDistCpt; //compacted distance list unsigned* unMask; float* fDevMdlPairX; float* fDevMdlPairY; float* fDevMdlPairZ; float* fDevScnPairX; float* fDevScnPairY; float* fDevScnPairZ; ///////////// float* cngfDevScnX; float* cngfDevScnY; float* cngfDevScnZ; float* cngfDevMdlPairX; float* cngfDevMdlPairY; float* cngfDevMdlPairZ; float* cngfDevScnPairX; float* cngfDevScnPairY; float* cngfDevScnPairZ; ///////////// float* fCenModX; float* fCenModY; float* fCenModZ; float* fCenScnX; float* fCenScnY; float* fCenScnZ; unsigned* unNoPairs; //kd-tree based nearest neighbor search, using a priority queue. void class_nns_priority( float* fDevScnX, //scene point cloud float* fDevScnY, float* fDevScnZ, float* fDist, //squared distance between pairs, for deviation calculation float* fDevSplit, //kd-tree: position of splitting plain (inner node) unsigned* unDevIdx, //kd-tree: index of point (leaf node) unsigned* unDevAxis, //kd-tree: axis where splitting plain locates (inner node) bool* bDevIsLeaf, //kd-tree: node type (both nodes) float* fDevLoBound, //kd-tree: lower bounding box (inner node) float* fDevHiBound, //kd-tree: higher bounding box (inner node) unsigned* unMask, //a 0-1 mask of pair and non-pairs. float* fDevMdlPairX, float* fDevMdlPairY, float* fDevMdlPairZ, float* fDevScnPairX, float* fDevScnPairY, float* fDevScnPairZ, float fSearchRadius, unsigned unSize, unsigned unWidth, unsigned unQStep); //for dubugging thread //centralize a pointcloud void class_centralize(unsigned* unMask, float* fDevMdlPairX, float* fDevMdlPairY, float* fDevMdlPairZ, float* fDevScnPairX, float* fDevScnPairY, float* fDevScnPairZ, float fcm0, float fcm1, float fcm2, float fcs0, float fcs1, float fcs2, float* fCenteredModX, //centered point cloud float* fCenteredModY, float* fCenteredModZ, float* fCenteredScnX, float* fCenteredScnY, float* fCenteredScnZ); //transform point cloud void class_transformation(float* fDevScnX, //piont cloud to be transformed float* fDevScnY, float* fDevScnZ, float m00, float m01, float m02, float m03, float m10, float m11, float m12, float m13, float m20, float m21, float m22, float m23); }; #endif