3dpcp/include/slam6d/cuda/CIcpGpuCuda.cuh
2012-09-16 14:33:11 +02:00

415 lines
10 KiB
Text
Executable file

/** @file
* @brief GPU-ICP Algorithm
* @author Deyuan Qiu, University of Applied Sciences Bonn-Rhein-Sieg, Sankt Augustin, Germany.
* Fraunhofer IAIS, Sankt Augustin, Germany.
*/
#ifndef CICPGPUCUDA_H
#define CICPGPUCUDA_H
#include <cstdlib> // C standard library
#include <cstdio> // C I/O (for sscanf)
#include <cstring> // string manipulation
#include <fstream> // file I/O
#include <algorithm> // min()
#include <time.h>
#include "ANN/ANN.h" // ANN declarations
#include "ANN/ANNperf.h" // kd-tree printing
#include "kd_tree.h" // ANN node declaration
#include "newmat/newmat.h"
#include "newmat/newmatap.h"
using namespace NEWMAT;
#include "slam6d/cuda/CSystem.h"
/*
* The maximum block size. For nVidia G80 architecture, 192 is suggested.
*/
//#define BLOCKSIZE 512
//#define BLOCKSIZE 256
#define BLOCKSIZE 64
//#define BLOCKSIZE 1
/*
* The big enough size of the AoS that is going to allocate for the kd-tree.
*/
//#define TREESIZE 262143
#define TREESIZE 524288
#ifdef use_namespace
using namespace std;
#endif
/*
* @class CIcpGpuCuda
* @brief Iterative Closest Point algorithm is implemented on a programmable graphic
* device. Kernels are implemented by CUDA (Compute Unified Device Architecture) GPGPU
* programming language (http://www.nvidia.com/object/cuda_home.html). To compile the code,
* nvcc compiler and related CUDA libraries must be installed. Kernel files are wrapped in
* CIcpGpuCuda_kernel.cuh and CIcpGpuCuda.cu. Attention: only NVidia GeForce G80 architecture
* and onwards graphic devices are garanteed to be supported.
* @author Deyuan Qiu
* @date 2008.Nov.
*/
class CIcpGpuCuda{
public:
/*
* standard constructor
* @param argc passed from application main function.
* @param argv passed from application main function.
* @param unWidth the width of point cloud image.
* @param unHeight the height of pint cloud image.
*/
CIcpGpuCuda(unsigned unWidth, unsigned unHeight, unsigned max_iter){
init(unWidth, unHeight,max_iter);
}
/*
* standard destructor
*/
~CIcpGpuCuda();
/*
* Set maximum iteration for ICP.
* @param unTimes number of times ICP iterates maximally.
*/
void setMaxIteration(unsigned unTimes);
/*
* Set maximum processing time for ICP, in milliseconds.
* @param dMilliseconds ICP quits when elapsed time exceeds.
*/
void setMaxProcTime(double dMilliseconds);
/*
* Set maximum deviation for ICP.
* @param dDeviation ICP quits when specified deviation is achieved.
*/
void setMaxDeviation(double dDeviation);
/*
* Set search radiuses for ICP. An iterative decreasing radius
* is applied. Search radius decreases linearly from fRadiusMax to
* fRadiusMin within unIterations iterations.
* @param fRadiusMax Initial radius.
* @param fRadiusMin Final radius.
* @param unIterations Number of iterations, in which radius decreases.
*/
void setSearchRadius(float fRadiusMax, float fRadiusMin, unsigned unIterations);
/*
* Get the number of points in point cloud. Model point cloud
* and scene point cloud must have the same number of points.
* @return Number of points in point cloud.
*/
unsigned getSize(void);
/*
* Get the the 2 dimensional pointer to the scene point cloud. Page-lock memory is
* allocated and freed by the class. The data type must be casted to single pricision
* float, and the array should be loaded as [3][N].
* @return 2 dimensional pointer to scene point cloud.
*/
float** getScenePointer(void);
/*
* Get the the 2 dimensional pointer to the model point cloud. Memory allocation
* and freeing is handled by the class. The data type must be casted to double pricision
* float, and the array should be loaded as [N][3]. Notice the difference to
* getScenePointer().
* @return 2 dimensional pointer to scene point cloud.
*/
double** getModelPointer(void);
/*
* The method is called after point clouds are loaded, and before iteration() is called.
*/
void setPointClouds(void);
/*
* ICP iterations.
*/
void iteration(void);
/*
* Get the transformation matrix.
* @return The transformation matrix.
*/
Matrix* getMatrix(void);
/**
* These two functions are to set and get the tree pointer
* It is supposed to be created in scan file and passed to this class
**/
void setTreePointer(ANNkd_tree *);
void getTreePointer(ANNkd_tree *&);
double getTime(void);
void setMinimums(float x, float y, float z);
Matrix** getMatrices();
void setTrans_Trans_inv(const double[], const double[]);
void findNearestNeighbors(float, unsigned);
Matrix fillHomoMatrix(Matrix* , double*);
void computeCentroid(float*, float*, float*, float *&);
Matrix computeHMatrix();
void printMatrix(Matrix *);
void setSize(unsigned int width, unsigned int height);
private:
////////////////
//cpu variables
////////////////
/*
* Initialization. Memories are allocated and default environmental state
* is set. Called by standard constructor.
* @param unWidth the width of point cloud image.
* @param unHeight the height of pint cloud image.
*/
void init(unsigned unWidth, unsigned unHeight, unsigned max_iter);
/*
* Initialization. Memories are allocated and default environmental state
* is set. Called by init().
* @param unWidth the width of point cloud image.
* @param unHeight the height of pint cloud image.
*/
void setResolution(unsigned unWidth, unsigned unHeight);
/*
* The internal search structure is set.
*/
void setTree(void);
/*
* Set the model point cloud.
*/
void setModel(void);
/*
* Set the scene point cloud.
*/
void setScene(void);
/*
* Calculate the size of the search structure from its depth. The depth
* is the number of levels of the tree. The search structure then is arraged
* into a struture of arrays (SoA). A left-balanced binary tree is suggested.
* @param nDepth Depth of the kd-tree.
*/
inline int depth2size(int nDepth);
/*
* Rearrange the search structure into a structure of arrays (SoA). The rule
* is: the N node's left child has the index of 2N, while the right child 2N+1.
* Arrangement is fulfilled in recursion.
* @param root pointer to the root node
* @param unStart the currenet node
*/
void rearrange(ANNkd_ptr root, unsigned unStart);
/*
* Get CUBLAS errors before this line.
*/
void getCublasErr(void);
/*
* Get CUDA errors before this line.
*/
void getCudaErr(void);
/*
* tree: structure of arrays (SoA)
*/
unsigned _unSizeTree; //Size of the allocated memory.
float* fSplit;
unsigned* unIdx;
unsigned* unAxis;
bool* bIsLeaf;
float* fLoBound;
float* fHiBound;
unsigned unSizeTree; //Size of memory that the tree actually takes.
/*
* transformation matrix
*/
float m[16];
Matrix* trans;
Matrix* trans_inv;
/*
* kernel constants
*/
unsigned unSizeData;
unsigned unNoThreads;
unsigned unNoBlocks;
unsigned _unWidth;
unsigned _unHeight;
/*
* search structure
*/
ANNkd_tree* kdTree;
ANNkdStats* st;
/*
* data pointers
*/
float* fHstScn[3];
float* fHstScnX;
float* fHstScnY;
float* fHstScnZ;
double** h_idata;
/*
* icp
*/
unsigned unMaxIteration;
unsigned _unIterations;
float fMaxProcTime;
float fMaxDeviation;
float _fSearchRadiusMax;
float _fSearchRadiusMin;
float _fRadiusStep;
unsigned _unNoQSizeStep;
float* temp_ones;
float* ones;
enum EnumIcpState { ICP_LIMIT = 0,
ICP_PROCESSING = 1,
ICP_NOTMATCHABLE = 2,
ICP_MAXITERATIONS = 3,
ICP_TIMEELAPSED = 4,
ICP_SUCCESS = 5 };
unsigned unPairs;
unsigned* pNoPairs;
Matrix* final_matrix;
double _dElapsedTime;
Matrix** matrices;
/*
Minimums of all values
*/
float min_x;
float min_y;
float min_z;
clock_t init_time; // To save the starting point of the timer Added by Shams
////////////////
//gpu variables
////////////////
float* fDevSplit;
unsigned* unDevIdx;
unsigned* unDevAxis;
bool* bDevIsLeaf;
float* fDevLoBound;
float* fDevHiBound;
float* fDevScnX;
float* fDevScnY;
float* fDevScnZ;
float* fDist;
float* fDistCpt; //compacted distance list
unsigned* unMask;
float* fDevMdlPairX;
float* fDevMdlPairY;
float* fDevMdlPairZ;
float* fDevScnPairX;
float* fDevScnPairY;
float* fDevScnPairZ;
/////////////
float* cngfDevScnX;
float* cngfDevScnY;
float* cngfDevScnZ;
float* cngfDevMdlPairX;
float* cngfDevMdlPairY;
float* cngfDevMdlPairZ;
float* cngfDevScnPairX;
float* cngfDevScnPairY;
float* cngfDevScnPairZ;
/////////////
float* fCenModX;
float* fCenModY;
float* fCenModZ;
float* fCenScnX;
float* fCenScnY;
float* fCenScnZ;
unsigned* unNoPairs;
//kd-tree based nearest neighbor search, using a priority queue.
void class_nns_priority(
float* fDevScnX, //scene point cloud
float* fDevScnY,
float* fDevScnZ,
float* fDist, //squared distance between pairs, for deviation calculation
float* fDevSplit, //kd-tree: position of splitting plain (inner node)
unsigned* unDevIdx, //kd-tree: index of point (leaf node)
unsigned* unDevAxis, //kd-tree: axis where splitting plain locates (inner node)
bool* bDevIsLeaf, //kd-tree: node type (both nodes)
float* fDevLoBound, //kd-tree: lower bounding box (inner node)
float* fDevHiBound, //kd-tree: higher bounding box (inner node)
unsigned* unMask, //a 0-1 mask of pair and non-pairs.
float* fDevMdlPairX,
float* fDevMdlPairY,
float* fDevMdlPairZ,
float* fDevScnPairX,
float* fDevScnPairY,
float* fDevScnPairZ,
float fSearchRadius,
unsigned unSize,
unsigned unWidth,
unsigned unQStep); //for dubugging thread
//centralize a pointcloud
void class_centralize(unsigned* unMask,
float* fDevMdlPairX,
float* fDevMdlPairY,
float* fDevMdlPairZ,
float* fDevScnPairX,
float* fDevScnPairY,
float* fDevScnPairZ,
float fcm0,
float fcm1,
float fcm2,
float fcs0,
float fcs1,
float fcs2,
float* fCenteredModX, //centered point cloud
float* fCenteredModY,
float* fCenteredModZ,
float* fCenteredScnX,
float* fCenteredScnY,
float* fCenteredScnZ);
//transform point cloud
void class_transformation(float* fDevScnX, //piont cloud to be transformed
float* fDevScnY,
float* fDevScnZ,
float m00, float m01, float m02, float m03,
float m10, float m11, float m12, float m13,
float m20, float m21, float m22, float m23);
};
#endif