3dpcp/.svn/pristine/b6/b65e2a04a83829a4c44ec9d05573f013548109b2.svn-base

870 lines
32 KiB
Text
Raw Normal View History

2012-09-16 12:33:11 +00:00
/** @file
* @brief GPU-ICP Algorithm
* @author Deyuan Qiu, University of Applied Sciences Bonn-Rhein-Sieg, Sankt Augustin, Germany.
* Fraunhofer IAIS, Sankt Augustin, Germany.
*/
#include "slam6d/cuda/CIcpGpuCuda_kernel.cuh"
#include "slam6d/cuda/CIcpGpuCuda.cuh"
#include "slam6d/cuda/CSystem.h"
/**
 * Allocates every host and device buffer used by the GPU ICP pipeline,
 * initializes CUBLAS, creates the CUDPP compaction plan and resets the
 * tunable state to zero.
 *
 * @param unWidth   grid width; forwarded to setResolution()
 * @param unHeight  grid height; forwarded to setResolution()
 * @param max_iter  stored as unMaxIteration; also the number of 4x4 identity
 *                  matrices pre-allocated in `matrices`
 *
 * Terminates the process (exit(1)) if the matrix table cannot be allocated or
 * CUBLAS fails to initialize.
 */
void CIcpGpuCuda::init(unsigned unWidth, unsigned unHeight, unsigned max_iter)
{
    // Only device 0 is used. The result is deliberately not checked: the
    // original code treats device selection as best effort.
    cudaSetDevice(0);

    unMaxIteration = max_iter;

    // One 4x4 matrix per iteration, each starting out as identity.
    matrices = (Matrix**)malloc(sizeof(Matrix*) * max_iter);
    if (matrices == NULL && max_iter > 0) {
        fprintf (stderr, "!!!! host memory allocation error (matrices)\n");
        exit(1);
    }
    for (unsigned i = 0; i < max_iter; ++i) {
        matrices[i] = new Matrix(4,4);
        Matrix* m = matrices[i];
        for (int r = 1; r <= 4; ++r)
            for (int c = 1; c <= 4; ++c)
                (*m)(r,c) = (r == c) ? 1 : 0;
    }

    // set data size (also configures dimGrid / dimBlock)
    setResolution(unWidth, unHeight);

    // Initialize CUBLAS
    cublasStatus statusCUBLAS = cublasInit();
    if (statusCUBLAS != CUBLAS_STATUS_SUCCESS) {
        cout<<"The error status is \n";
        cout<<statusCUBLAS<<endl;
        fprintf (stderr, "!!!! CUBLAS initialization error\n");
        exit(1);
    }

    // Initialize CUDPP: forward stream compaction of floats
    CUDPPConfiguration config;
    config.datatype = CUDPP_FLOAT;
    config.algorithm = CUDPP_COMPACT;
    config.options = CUDPP_OPTION_FORWARD;
    result = cudppPlan(&compactplan, config, unSizeData, 1, 0);
    if (CUDPP_SUCCESS != result) printf("Error creating CUDPPPlan\n");

    // Pinned host staging buffers for the flattened kd-tree
    _unSizeTree = (unsigned)TREESIZE;
    CUDA_SAFE_CALL(cudaMallocHost((void**)&fSplit, _unSizeTree*sizeof(float)));
    CUDA_SAFE_CALL(cudaMallocHost((void**)&unIdx, _unSizeTree*sizeof(unsigned)));
    CUDA_SAFE_CALL(cudaMallocHost((void**)&unAxis, _unSizeTree*sizeof(unsigned)));
    CUDA_SAFE_CALL(cudaMallocHost((void**)&bIsLeaf, _unSizeTree*sizeof(bool)));
    CUDA_SAFE_CALL(cudaMallocHost((void**)&fLoBound, _unSizeTree*sizeof(float)));
    CUDA_SAFE_CALL(cudaMallocHost((void**)&fHiBound, _unSizeTree*sizeof(float)));
    CUDA_SAFE_CALL(cudaMallocHost((void**)&f4Mdl, unSizeData*sizeof(float4))); // to be downloaded to texture

    // Host memory allocation: scene coordinates, one array per axis
    CUDA_SAFE_CALL(cudaMallocHost((void**)&fHstScnX, unSizeData*sizeof(float)));
    CUDA_SAFE_CALL(cudaMallocHost((void**)&fHstScnY, unSizeData*sizeof(float)));
    CUDA_SAFE_CALL(cudaMallocHost((void**)&fHstScnZ, unSizeData*sizeof(float)));
    fHstScn[0]=fHstScnX;
    fHstScn[1]=fHstScnY;
    fHstScn[2]=fHstScnZ;
    CUDA_SAFE_CALL(cudaMallocHost((void**)&pNoPairs, sizeof(unsigned)));
    CSystem<double>::allocate(unSizeData, 3, h_idata); // model

    // Device memory allocation
    CUDA_SAFE_CALL(cudaMalloc((void**)&fDist, unSizeData*sizeof(float)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&fDistCpt, unSizeData*sizeof(float)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&unMask, unSizeData*sizeof(unsigned)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&fDevMdlPairX, unSizeData*sizeof(float))); // pairs after shrinking
    CUDA_SAFE_CALL(cudaMalloc((void**)&fDevMdlPairY, unSizeData*sizeof(float)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&fDevMdlPairZ, unSizeData*sizeof(float)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&fDevScnPairX, unSizeData*sizeof(float)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&fDevScnPairY, unSizeData*sizeof(float)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&fDevScnPairZ, unSizeData*sizeof(float)));
    /////////////// Added by Shams
    CUDA_SAFE_CALL(cudaMalloc((void**)&cngfDevScnX, unSizeData*sizeof(float)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&cngfDevScnY, unSizeData*sizeof(float)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&cngfDevScnZ, unSizeData*sizeof(float)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&cngfDevMdlPairX, unSizeData*sizeof(float))); // pairs after shrinking
    CUDA_SAFE_CALL(cudaMalloc((void**)&cngfDevMdlPairY, unSizeData*sizeof(float)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&cngfDevMdlPairZ, unSizeData*sizeof(float)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&cngfDevScnPairX, unSizeData*sizeof(float)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&cngfDevScnPairY, unSizeData*sizeof(float)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&cngfDevScnPairZ, unSizeData*sizeof(float)));
    ///////////////
    CUDA_SAFE_CALL(cudaMalloc((void**)&fCenModX, unSizeData*sizeof(float)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&fCenModY, unSizeData*sizeof(float)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&fCenModZ, unSizeData*sizeof(float)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&fCenScnX, unSizeData*sizeof(float)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&fCenScnY, unSizeData*sizeof(float)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&fCenScnZ, unSizeData*sizeof(float)));
    // iteration() hands this buffer to cudppCompact as a size_t*, so it must
    // be large enough for a size_t; sizeof(unsigned) is too small on 64-bit.
    CUDA_SAFE_CALL(cudaMalloc((void**)&unNoPairs, sizeof(size_t)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&fDevScnX, unSizeData*sizeof(float)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&fDevScnY, unSizeData*sizeof(float)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&fDevScnZ, unSizeData*sizeof(float)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&fDevSplit, _unSizeTree*sizeof(float)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&unDevIdx, _unSizeTree*sizeof(unsigned)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&unDevAxis, _unSizeTree*sizeof(unsigned)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&bDevIsLeaf, _unSizeTree*sizeof(bool)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&fDevLoBound, _unSizeTree*sizeof(float)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&fDevHiBound, _unSizeTree*sizeof(float)));
    CUDA_SAFE_CALL(cudaMallocArray(&cuArray, &cuDesc, _unWidth, _unHeight)); // to be bound to texture

    // Initialize states
    fMaxProcTime = 0.0f;
    fMaxDeviation = 0.0f;
    _fSearchRadiusMax = 0.0f;
    _fSearchRadiusMin = 0.0f;
    _fRadiusStep = 0.0f;
    _unNoQSizeStep = 0;
    _dElapsedTime = 0.0;

    /*
     * Device array of ones: a dot product with it yields a plain sum
     * (used instead of an absolute-value sum).
     */
    CUDA_SAFE_CALL(cudaMallocHost((void**)&temp_ones, unSizeData*sizeof(float)));
    CUDA_SAFE_CALL(cudaMalloc((void**)&ones, unSizeData*sizeof(float)));
    for (unsigned i = 0; i < unSizeData; ++i) temp_ones[i] = 1.0f;
    CUDA_SAFE_CALL(cudaMemcpy(ones, temp_ones, unSizeData*sizeof(float), cudaMemcpyHostToDevice));
}
/**
 * Releases the texture binding, the CUDA array, all device buffers and all
 * pinned host buffers allocated in init(), then shuts down CUDPP, CUBLAS
 * and ANN.
 *
 * NOTE(review): the `matrices` table allocated in init() and the Matrix
 * objects created with `new` in iteration() and setTrans_Trans_inv() are not
 * released here. They cannot be freed safely from this block alone because
 * their initialization state is not visible in this file.
 */
CIcpGpuCuda::~CIcpGpuCuda(){
    /////////////
    // tidy up
    /////////////
    // Unbind the texture before destroying the array it is bound to.
    CUDA_SAFE_CALL(cudaUnbindTexture(refTex));
    CUDA_SAFE_CALL(cudaFreeArray(cuArray));

    // kd-tree on the device
    CUDA_SAFE_CALL(cudaFree(fDevSplit));
    CUDA_SAFE_CALL(cudaFree(unDevIdx));
    CUDA_SAFE_CALL(cudaFree(unDevAxis));
    CUDA_SAFE_CALL(cudaFree(bDevIsLeaf));
    CUDA_SAFE_CALL(cudaFree(fDevLoBound));
    CUDA_SAFE_CALL(cudaFree(fDevHiBound));

    // scene, distances, centered clouds, mask and pair buffers
    CUDA_SAFE_CALL(cudaFree(fDevScnX));
    CUDA_SAFE_CALL(cudaFree(fDevScnY));
    CUDA_SAFE_CALL(cudaFree(fDevScnZ));
    CUDA_SAFE_CALL(cudaFree(fDist));
    CUDA_SAFE_CALL(cudaFree(fDistCpt));
    CUDA_SAFE_CALL(cudaFree(fCenModX));
    CUDA_SAFE_CALL(cudaFree(fCenModY));
    CUDA_SAFE_CALL(cudaFree(fCenModZ));
    CUDA_SAFE_CALL(cudaFree(fCenScnX));
    CUDA_SAFE_CALL(cudaFree(fCenScnY));
    CUDA_SAFE_CALL(cudaFree(fCenScnZ));
    CUDA_SAFE_CALL(cudaFree(unMask));
    CUDA_SAFE_CALL(cudaFree(fDevMdlPairX));
    CUDA_SAFE_CALL(cudaFree(fDevMdlPairY));
    CUDA_SAFE_CALL(cudaFree(fDevMdlPairZ));
    CUDA_SAFE_CALL(cudaFree(fDevScnPairX));
    CUDA_SAFE_CALL(cudaFree(fDevScnPairY));
    CUDA_SAFE_CALL(cudaFree(fDevScnPairZ));
    CUDA_SAFE_CALL(cudaFree(cngfDevScnX));
    CUDA_SAFE_CALL(cudaFree(cngfDevScnY));
    CUDA_SAFE_CALL(cudaFree(cngfDevScnZ));
    CUDA_SAFE_CALL(cudaFree(cngfDevMdlPairX));
    CUDA_SAFE_CALL(cudaFree(cngfDevMdlPairY));
    CUDA_SAFE_CALL(cudaFree(cngfDevMdlPairZ));
    CUDA_SAFE_CALL(cudaFree(cngfDevScnPairX));
    CUDA_SAFE_CALL(cudaFree(cngfDevScnPairY));
    CUDA_SAFE_CALL(cudaFree(cngfDevScnPairZ));
    CUDA_SAFE_CALL(cudaFree(unNoPairs));
    CUDA_SAFE_CALL(cudaFree(ones));

    // pinned host buffers
    CUDA_SAFE_CALL(cudaFreeHost(fSplit));
    CUDA_SAFE_CALL(cudaFreeHost(unIdx));
    CUDA_SAFE_CALL(cudaFreeHost(unAxis));
    CUDA_SAFE_CALL(cudaFreeHost(bIsLeaf));
    CUDA_SAFE_CALL(cudaFreeHost(fHstScnX));
    CUDA_SAFE_CALL(cudaFreeHost(fHstScnY));
    CUDA_SAFE_CALL(cudaFreeHost(fHstScnZ));
    CUDA_SAFE_CALL(cudaFreeHost(fLoBound));
    CUDA_SAFE_CALL(cudaFreeHost(fHiBound));
    CUDA_SAFE_CALL(cudaFreeHost(pNoPairs));
    CUDA_SAFE_CALL(cudaFreeHost(f4Mdl));
    CUDA_SAFE_CALL(cudaFreeHost(temp_ones));
    free(h_idata);

    /////////
    // Exit
    /////////
    // Done with CUDPP
    result = cudppDestroyPlan(compactplan);
    if (CUDPP_SUCCESS != result){
        printf("Error destroying CUDPPPlan\n");
    }
    // Done with CUBLAS
    cublasStatus statusCUBLAS = cublasShutdown();
    if (statusCUBLAS != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! shutdown error (A)\n");
    }
    // Done with ANN
    annClose();
}
/**
 * Stores the grid dimensions, derives unSizeData = width * height and picks
 * the launch configuration (dimBlock / dimGrid) for that many elements.
 *
 * @param unWidth   grid width, must be non-zero
 * @param unHeight  grid height, must be non-zero
 *
 * Terminates the process (exit(1)) if either dimension is zero.
 */
void CIcpGpuCuda::setResolution(unsigned unWidth, unsigned unHeight){
    if (unWidth == 0 || unHeight == 0) {
        cout<<"Each aspect of resolution must be bigger than 0."<<endl;
        exit(1);
    }

    _unWidth = unWidth;
    _unHeight = unHeight;
    unSizeData = _unWidth*_unHeight;

    // configure block and grid size
    const unsigned unBlockSize = (unsigned)BLOCKSIZE;
    unNoBlocks = 1;                       // overridden only for large inputs
    if (unSizeData <= 64) {
        unNoThreads = 64;
    } else if (unSizeData <= 128) {
        unNoThreads = 128;
    } else {
        unNoThreads = unBlockSize;
        if (unSizeData > unBlockSize) {
            // enough blocks to cover every element, rounding up
            unNoBlocks = unSizeData/unBlockSize;
            if (unSizeData%unBlockSize) ++unNoBlocks;
        }
    }
    cout<<"unNoThreads: "<<unNoThreads<<'\t'<<"unNoBlocks: "<<unNoBlocks<<endl;

    dimBlock.x = unNoThreads;
    dimBlock.y = 1;
    dimBlock.z = 1;
    dimGrid.x = unNoBlocks;
    dimGrid.y = 1;
    dimGrid.z = 1;
}
/**
 * Sets the iteration cap (unMaxIteration) checked by iteration().
 *
 * @param unTimes  new cap, must be non-zero; exits the process otherwise
 *
 * NOTE(review): init() sizes the `matrices` table from its own max_iter
 * argument and iteration() writes one entry per pass, so raising the cap above
 * the value given to init() looks unsafe - confirm with callers.
 */
void CIcpGpuCuda::setMaxIteration(unsigned unTimes){
    if (unTimes == 0) {
        cout<<"Error setting maximum iterations."<<endl;
        exit(1);
    }
    unMaxIteration = unTimes;
}
/**
 * Sets the processing-time budget (fMaxProcTime) that iteration() compares
 * against its elapsed time in milliseconds.
 *
 * @param dMilliseconds  budget, must be greater than zero; exits otherwise
 */
void CIcpGpuCuda::setMaxProcTime(double dMilliseconds){
    if (!(dMilliseconds > 0)) {
        cout<<"Error setting maximum processing time."<<endl;
        exit(1);
    }
    fMaxProcTime = dMilliseconds;
}
/**
 * Sets the deviation threshold (fMaxDeviation); iteration() reports success
 * once its computed deviation is at or below this value.
 *
 * @param fDeviation  threshold, must be greater than zero; exits otherwise
 */
void CIcpGpuCuda::setMaxDeviation(double fDeviation){
    if (!(fDeviation > 0)) {
        cout<<"Error setting maximum deviation."<<endl;
        exit(1);
    }
    fMaxDeviation = fDeviation;
}
void CIcpGpuCuda::setSize(unsigned int width, unsigned int height){
//The memory allocation is once done for all scans
//we require to update the sizes for each scan matching process
//The size which is set in constructor is the maximum possible,
//and this size is associated to each pair.
setResolution(width, height);
cout<<"unSizeData: "<<unSizeData<<endl;
}
/**
 * Configures the search-radius schedule used by iteration(): the radius
 * starts at fRadiusMax and is reduced by a fixed step per iteration.
 *
 * @param fRadiusMax    starting radius, must be >= fRadiusMin
 * @param fRadiusMin    lower end of the schedule
 * @param unIterations  number of steps the range is divided into, must be > 0
 *
 * Terminates the process (exit(1)) on invalid arguments.
 */
void CIcpGpuCuda::setSearchRadius(float fRadiusMax, float fRadiusMin, unsigned unIterations){
    if (!(fRadiusMax >= fRadiusMin) || unIterations == 0) {
        cout<<"Error setting search radius."<<endl;
        exit(1);
    }
    _fSearchRadiusMax = fRadiusMax;
    _fSearchRadiusMin = fRadiusMin;
    _fRadiusStep = (_fSearchRadiusMax-_fSearchRadiusMin)/(float)unIterations;
    _unIterations = unIterations;
    // iteration() divides by this value. The integer division yields 0 when
    // unIterations < NO_QSIZE, so clamp to 1 to avoid a division by zero.
    _unNoQSizeStep = unIterations/(unsigned)NO_QSIZE;
    if (_unNoQSizeStep == 0) _unNoQSizeStep = 1;
}
/** Returns fHstScn, whose three entries init() points at the host scene X, Y and Z arrays. */
float** CIcpGpuCuda::getScenePointer() {
    float** ppScene = fHstScn;
    return ppScene;
}
/** Returns h_idata, the host model buffer that setModel() reads as h_idata[i][0..2]. */
double** CIcpGpuCuda::getModelPointer(void){
    double** ppModel = h_idata;
    return ppModel;
}
/** Returns unSizeData, the current element count (width * height). */
unsigned CIcpGpuCuda::getSize(void){
    const unsigned unCount = unSizeData;
    return unCount;
}
/**
 * Stores the ANN kd-tree that setTree() will flatten and upload.
 * Only the pointer is stored; the tree itself is not copied.
 */
void CIcpGpuCuda::setTreePointer(ANNkd_tree *tree){
    this->kdTree = tree;
}
/** Writes the currently stored kd-tree pointer into the caller's `tree`. */
void CIcpGpuCuda::getTreePointer(ANNkd_tree *&tree ){
    tree = this->kdTree;
}
/**
 * Flattens the stored ANN kd-tree into the host staging arrays (via
 * rearrange()) and copies those arrays to the device.
 *
 * The array size is derived from the tree depth with depth2size() and must
 * fit in the _unSizeTree entries allocated by init().
 *
 * Terminates the process (exit(1)) if the tree is too large or if there are
 * fewer than two data points.
 */
void CIcpGpuCuda::setTree()
{
    // preparation
    st = new ANNkdStats();
    kdTree->getStats(*st);
    int nDepth = st->depth;

    // decide size of array to be uploaded to GPU
    unSizeTree = depth2size(nDepth);
    if (unSizeTree>_unSizeTree) {
        // Report the allocated capacity as the limit, plus what was required.
        cout << "Not enough memory for tree construction. Tree size must not exceed "
             << _unSizeTree << " (required: " << unSizeTree << ")" <<endl;
        exit(1);
    }

    // rearrange: the root is stored at 1-based position 1
    ANNkd_split* pRoot = (ANNkd_split*)kdTree->getRoot();
    if (unSizeData>1) {
        rearrange(pRoot, 1);
    } else {
        cout<<"Not enough points in the tree."<<endl;
        exit(1);
    }

    // download the tree
    CUDA_SAFE_CALL(cudaMemcpy(fDevSplit, fSplit, unSizeTree*sizeof(float), cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(unDevIdx, unIdx, unSizeTree*sizeof(unsigned), cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(unDevAxis, unAxis, unSizeTree*sizeof(unsigned), cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(bDevIsLeaf, bIsLeaf, unSizeTree*sizeof(bool), cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(fDevLoBound, fLoBound, unSizeTree*sizeof(float), cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(fDevHiBound, fHiBound, unSizeTree*sizeof(float), cudaMemcpyHostToDevice));

    // clean up (the kd-tree itself is left to its owner)
    delete st;
    st = NULL;  // do not leave a dangling pointer in the member
}
void CIcpGpuCuda::setModel()
{
// using texture memory
for(unsigned i = 0; i < unSizeData; i++) { // type cast
f4Mdl[i].x=(float)h_idata[i][0];
f4Mdl[i].y=(float)h_idata[i][1];
f4Mdl[i].z=(float)h_idata[i][2];
}
cudaMemcpyToArray(cuArray,0,0,f4Mdl,unSizeData*sizeof(float4),cudaMemcpyHostToDevice);
cudaBindTextureToArray(refTex,cuArray);
}
/**
 * Copies the three host scene-coordinate arrays (X, Y, Z) to their device
 * counterparts; unSizeData floats are copied per axis.
 */
void CIcpGpuCuda::setScene()
{
    float* pDst[3] = { fDevScnX, fDevScnY, fDevScnZ };
    float* pSrc[3] = { fHstScnX, fHstScnY, fHstScnZ };
    for (int axis = 0; axis < 3; ++axis) {
        CUDA_SAFE_CALL(cudaMemcpy(pDst[axis], pSrc[axis], unSizeData*sizeof(float), cudaMemcpyHostToDevice));
    }
}
/**
 * Returns the sum 2^0 + 2^1 + ... + 2^nDepth.
 *
 * @param nDepth  depth to sum up to, inclusive
 * @return the sum; 0 when nDepth is negative
 */
inline int CIcpGpuCuda::depth2size(int nDepth)
{
    // Each pass appends one more power of two to the sum:
    // total(k) = 2 * total(k-1) + 1.
    int nTotal = 0;
    for (int level = 0; level <= nDepth; ++level)
    {
        nTotal = 2 * nTotal + 1;
    }
    return nTotal;
}
/**
 * Recursively writes the kd-tree rooted at `root` into the flat host arrays.
 *
 * A node at 1-based position p is stored at index p-1; its children are
 * placed at positions 2p (left) and 2p+1 (right). NULL and KD_TRIVIAL nodes
 * are skipped, so their slots are left untouched.
 *
 * @param root     node to store
 * @param unStart  1-based position of `root` in the flat layout
 */
void CIcpGpuCuda::rearrange(ANNkd_ptr root, unsigned unStart)
{
    if (root == NULL || root == KD_TRIVIAL)
        return;

    const unsigned unSlot = unStart - 1;

    if (root->isLeaf()) {
        // Leaf: record only the first point index held by the leaf.
        ANNkd_leaf* pLeaf = (ANNkd_leaf*)root;
        bIsLeaf[unSlot] = true;
        unIdx[unSlot] = (unsigned)pLeaf->getIdxArray()[0];
        return;
    }

    // Inner node: record splitting plane and bounds, then descend.
    ANNkd_split* pSplit = (ANNkd_split*)root;
    ANNkd_ptr pLeft = pSplit->getLeftChild();
    ANNkd_ptr pRight = pSplit->getRightChild();
    bIsLeaf[unSlot] = false;
    fSplit[unSlot] = (float)pSplit->getCutVal();
    unAxis[unSlot] = (unsigned)pSplit->getCutDim();
    fLoBound[unSlot] = pSplit->getLoBound();
    fHiBound[unSlot] = pSplit->getHiBound();
    rearrange(pLeft, unStart*2);
    rearrange(pRight, unStart*2+1);
}
void CIcpGpuCuda::setPointClouds(void){
setTree();
setModel();
setScene();
}
/**
 * Runs the ICP main loop until a termination condition is met.
 *
 * Each pass: nearest-neighbour search, compaction of the pair distances,
 * centroid and deviation computation, termination check and, if the loop
 * continues, an SVD-based rigid transform that is applied to the device
 * scene and accumulated into final_matrix. The per-iteration transform is
 * also stored in matrices[unNoIter - 1].
 *
 * Results: final_matrix (see getMatrix()), matrices (see getMatrices()),
 * _dElapsedTime (see getTime()). A summary is printed to cout.
 *
 * NOTE(review): final_matrix is re-allocated on every call and the previous
 * object is not deleted here, because callers may still hold the pointer
 * returned by getMatrix() - confirm ownership.
 */
void CIcpGpuCuda::iteration(){
    //////////////////////////
    // prepare for iteration
    //////////////////////////
    EnumIcpState icpStat = ICP_PROCESSING;
    float fSearchRadius = _fSearchRadiusMax;
    unsigned unNoIter = 0;
    float fDeviation = 0.0f;
    unsigned unQStep = 0;

    final_matrix = new Matrix(4,4);
    for (int r = 1; r <= 4; ++r)
        for (int c = 1; c <= 4; ++c)
            (*final_matrix)(r,c) = (r == c) ? 1.0 : 0.0;
    Matrix matrix(4,4);

    // Centroid buffers. The original used `new float(3)`, which allocates a
    // single float, so writes to [1] and [2] were out of bounds, and it leaked
    // both on every pass. computeCentroid() takes float*&, hence the named
    // pointers to the stack arrays.
    float fCmBuf[3] = { 0.0f, 0.0f, 0.0f };
    float fCsBuf[3] = { 0.0f, 0.0f, 0.0f };
    float *fCm = fCmBuf;
    float *fCs = fCsBuf;

    init_time = clock();

    // The main loop of ICP
    while(icpStat == ICP_PROCESSING){
        // _unNoQSizeStep is 0 until setSearchRadius() is called with enough
        // iterations; skip the update rather than divide by zero.
        if (unNoIter <= _unIterations && _unNoQSizeStep != 0)
            unQStep = unNoIter/_unNoQSizeStep;

        findNearestNeighbors(fSearchRadius, unQStep);

        result=cudppCompact(compactplan, fDistCpt, (size_t*)unNoPairs,
                            fDist, unMask, (size_t)unSizeData);
        if (CUDPP_SUCCESS != result)
            printf("Error cudppCompact\n");
        CUDA_SAFE_CALL(cudaMemcpy(pNoPairs, unNoPairs, sizeof(unsigned), cudaMemcpyDeviceToHost));
        unPairs = *pNoPairs;

        if (unPairs) {
            //////////////////////
            // transform estimation
            //////////////////////
            // Compute centroids (assume all data are non-negative)
            computeCentroid(fDevMdlPairX,fDevMdlPairY,fDevMdlPairZ, fCm);
            computeCentroid(fDevScnPairX,fDevScnPairY,fDevScnPairZ, fCs);

            // Only the first unPairs entries of the compacted array are
            // written by the compaction; summing beyond them would add stale
            // values from earlier passes.
            fDeviation = cublasSdot(unPairs,fDistCpt,1,ones,1);
            fDeviation /= unPairs;
            getCublasErr();

            // check for termination conditions
            unNoIter++;
            if(unNoIter<_unIterations)
                fSearchRadius-=_fRadiusStep;

            if(fDeviation <= fMaxDeviation)
                icpStat = ICP_SUCCESS;
            else if(unNoIter >= unMaxIteration + 1) // unNoIter starts from 1
                icpStat = ICP_MAXITERATIONS;
            else if ( (double)(clock() - init_time)/ (double)CLOCKS_PER_SEC * 1000>= fMaxProcTime )
                icpStat = ICP_TIMEELAPSED;
            else
            {
                //Calculate centered point pairs
                class_centralize(unMask,
                                 fDevMdlPairX,fDevMdlPairY,fDevMdlPairZ,fDevScnPairX,fDevScnPairY,fDevScnPairZ,
                                 fCm[0],fCm[1],fCm[2],fCs[0],fCs[1],fCs[2],
                                 fCenModX,fCenModY,fCenModZ,fCenScnX,fCenScnY,fCenScnZ);

                //Fill H matrix
                Matrix H(3,3);
                H = computeHMatrix();

                //SVD
                Matrix U(3,3);
                DiagonalMatrix Lamda(3);
                Matrix V(3,3);
                SVD(H,Lamda,U,V);

                //Get rotation
                Matrix R(3,3);
                R = V*(U.t());

                // Calculate translation: t = cm - R * cs
                double dTranslation[3];
                ColumnVector col_vec(3);
                for(unsigned j = 0; j < 3; j++)
                    col_vec(j+1) = fCs[j];
                ColumnVector r_time_colVec = ColumnVector(R*col_vec);
                dTranslation[0] = fCm[0] - r_time_colVec(1);
                dTranslation[1] = fCm[1] - r_time_colVec(2);
                dTranslation[2] = fCm[2] - r_time_colVec(3);

                matrix = fillHomoMatrix(&R,dTranslation);
                *final_matrix = matrix * (*final_matrix);

                // keep a copy of this iteration's transform
                for(int i = 1 ; i < 5 ; ++i)
                    for(int j = 1; j < 5 ; ++j){
                        (*matrices[unNoIter - 1])(i,j) = (matrix)(i,j);
                    }

                /////////////
                // transform
                /////////////
                class_transformation(fDevScnX, fDevScnY, fDevScnZ,
                                     (float)matrix(1,1), (float)matrix(1,2), (float)matrix(1,3), (float)matrix(1,4),
                                     (float)matrix(2,1), (float)matrix(2,2), (float)matrix(2,3), (float)matrix(2,4),
                                     (float)matrix(3,1), (float)matrix(3,2), (float)matrix(3,3), (float)matrix(3,4));
            }
        }//if(unPairs)
        else icpStat = ICP_NOTMATCHABLE;
    }//while(icpStat == ICP_PROCESSING)

    _dElapsedTime = (double)(clock() - init_time)/(double)CLOCKS_PER_SEC * 1000.0; //temporary

    cout<<"=========="<<endl;
    switch(icpStat)
    {
    case ICP_LIMIT:
        cout<<"terminated: convergent limit reached."<<endl;
        break;
    case ICP_NOTMATCHABLE:
        cout<<"terminated: point clounds not matchable."<<endl;
        break;
    case ICP_MAXITERATIONS:
        cout<<"terminated: maximum iteration exceeds."<<endl;
        break;
    case ICP_TIMEELAPSED:
        cout<<"terminated: maximum time elapsed."<<endl;
        break;
    case ICP_SUCCESS:
        cout<<"succeeded: maximum deviation reached."<<endl;
        break;
    }
    cout<<"elapsed time:\t"<<_dElapsedTime<<"ms"<<endl;
    // unNoIter is still 0 if the very first pass found no pairs; avoid the
    // unsigned wrap-around of 0 - 1.
    cout<<"iterations:\t"<<(unNoIter ? unNoIter - 1 : 0)<<endl;
    cout<<"deviation:\t"<<fDeviation<<endl;
}
double CIcpGpuCuda::getTime(void){
return _dElapsedTime;
}
/** Returns final_matrix, the accumulated 4x4 transform built by iteration(). */
Matrix* CIcpGpuCuda::getMatrix(void){
    Matrix* pAccumulated = final_matrix;
    return pAccumulated;
}
void CIcpGpuCuda::getCublasErr()
{
cublasStatus statusCUBLAS;
statusCUBLAS = cublasGetError();
if (statusCUBLAS != CUBLAS_STATUS_SUCCESS) {
cout<<"CUBLAS error: ";
switch(statusCUBLAS){
case CUBLAS_STATUS_NOT_INITIALIZED: cout<<"CUBLAS library not initialized"<<endl;break;
case CUBLAS_STATUS_ALLOC_FAILED: cout<<"resource allocation failed"<<endl;break;
case CUBLAS_STATUS_INVALID_VALUE: cout<<"unsupported numerical value was passed to function"<<endl;break;
case CUBLAS_STATUS_MAPPING_ERROR: cout<<"access to GPU memory space failed"<<endl;break;
case CUBLAS_STATUS_EXECUTION_FAILED: cout<<"GPU program failed to execute"<<endl;break;
case CUBLAS_STATUS_INTERNAL_ERROR: cout<<"an internal CUBLAS operation failed"<<endl;break;
default: cout<<"undefined error"<<endl;
}
}
// cout<<"error number: "<<statusCUBLAS<<endl;
// fprintf (stderr, "CUBLAS error.\n");};
}
void CIcpGpuCuda::getCudaErr(void){
cudaError_t error=cudaGetLastError();
cout<<cudaGetErrorString(error)<<endl;
}
/**
 * kd-tree based nearest-neighbour search using a priority queue.
 *
 * Thin forwarder to wrapper_nns_priority(): every argument is passed through
 * unchanged, with the object's dimGrid / dimBlock inserted as the launch
 * configuration.
 */
void CIcpGpuCuda::class_nns_priority(
    float* fDevScnX,        // scene point cloud
    float* fDevScnY,
    float* fDevScnZ,
    float* fDist,           // squared distance between pairs, for deviation calculation
    float* fDevSplit,       // kd-tree: position of splitting plane (inner node)
    unsigned* unDevIdx,     // kd-tree: index of point (leaf node)
    unsigned* unDevAxis,    // kd-tree: axis of the splitting plane (inner node)
    bool* bDevIsLeaf,       // kd-tree: node type (both node kinds)
    float* fDevLoBound,     // kd-tree: lower bound (inner node)
    float* fDevHiBound,     // kd-tree: upper bound (inner node)
    unsigned* unMask,       // 0-1 mask of pairs and non-pairs
    float* fDevMdlPairX,
    float* fDevMdlPairY,
    float* fDevMdlPairZ,
    float* fDevScnPairX,
    float* fDevScnPairY,
    float* fDevScnPairZ,
    float fSearchRadius,
    unsigned unSize,
    unsigned unWidth,
    unsigned unQStep)
{
    wrapper_nns_priority(
        fDevScnX, fDevScnY, fDevScnZ,
        fDist,
        fDevSplit, unDevIdx, unDevAxis, bDevIsLeaf, fDevLoBound, fDevHiBound,
        unMask,
        fDevMdlPairX, fDevMdlPairY, fDevMdlPairZ,
        fDevScnPairX, fDevScnPairY, fDevScnPairZ,
        dimGrid, dimBlock,
        fSearchRadius,
        unSize, unWidth,
        unQStep);
}
/**
 * Centralizes a point cloud.
 *
 * Thin forwarder to wrapper_centralize(): passes the mask, the paired model
 * and scene coordinates, both centroids (fcm*, fcs*) and the output buffers
 * through unchanged, followed by the object's dimGrid / dimBlock.
 */
void CIcpGpuCuda::class_centralize(unsigned* unMask,
    float* fDevMdlPairX,
    float* fDevMdlPairY,
    float* fDevMdlPairZ,
    float* fDevScnPairX,
    float* fDevScnPairY,
    float* fDevScnPairZ,
    float fcm0,
    float fcm1,
    float fcm2,
    float fcs0,
    float fcs1,
    float fcs2,
    float* fCenteredModX,   // centered point cloud (outputs)
    float* fCenteredModY,
    float* fCenteredModZ,
    float* fCenteredScnX,
    float* fCenteredScnY,
    float* fCenteredScnZ)
{
    wrapper_centralize(
        unMask,
        fDevMdlPairX, fDevMdlPairY, fDevMdlPairZ,
        fDevScnPairX, fDevScnPairY, fDevScnPairZ,
        fcm0, fcm1, fcm2,
        fcs0, fcs1, fcs2,
        fCenteredModX, fCenteredModY, fCenteredModZ,
        fCenteredScnX, fCenteredScnY, fCenteredScnZ,
        dimGrid, dimBlock);
}
/**
 * Transforms a point cloud.
 *
 * Thin forwarder to wrapper_transformation(): passes the three coordinate
 * arrays and the twelve coefficients m00..m23 through unchanged, followed by
 * the object's dimGrid / dimBlock.
 */
void CIcpGpuCuda::class_transformation(
    float* fDevScnX,        // point cloud to be transformed
    float* fDevScnY,
    float* fDevScnZ,
    float m00, float m01, float m02, float m03,
    float m10, float m11, float m12, float m13,
    float m20, float m21, float m22, float m23)
{
    wrapper_transformation(
        fDevScnX, fDevScnY, fDevScnZ,
        m00, m01, m02, m03,
        m10, m11, m12, m13,
        m20, m21, m22, m23,
        dimGrid, dimBlock);
}
void CIcpGpuCuda::setMinimums(float x, float y, float z)
{
min_x = x;
min_y = y;
min_z = z;
}
/** Returns the table of per-iteration 4x4 matrices allocated in init() and filled by iteration(). */
Matrix** CIcpGpuCuda::getMatrices(){
    Matrix** ppPerIteration = matrices;
    return ppPerIteration;
}
/**
 * Builds the 4x4 matrices `trans` and `trans_inv` from two 16-element arrays.
 *
 * The arrays are read column by column: element (row, col), both 1-based,
 * comes from index (col-1)*4 + (row-1).
 *
 * @param tr      16 values for `trans`
 * @param tr_inv  16 values for `trans_inv`
 *
 * NOTE(review): both matrices are allocated with `new` on every call and any
 * previously stored pointers are overwritten without being deleted.
 */
void CIcpGpuCuda::setTrans_Trans_inv(const double tr[], const double tr_inv[]){
    trans = new Matrix(4,4);
    trans_inv = new Matrix(4,4);

    for (int col = 1; col <= 4; ++col) {
        for (int row = 1; row <= 4; ++row) {
            const int flat = (col - 1) * 4 + (row - 1);
            (*trans)(row,col) = tr[flat];
            (*trans_inv)(row,col) = tr_inv[flat];
        }
    }
}
void CIcpGpuCuda::findNearestNeighbors(float fSearchRadius, unsigned unQStep)
{
// We make a copy of the existing scene point cloud in order to transform it
cudaMemcpy(cngfDevScnX, fDevScnX, unSizeData*sizeof(float), cudaMemcpyDeviceToDevice);
cudaMemcpy(cngfDevScnY, fDevScnY, unSizeData*sizeof(float), cudaMemcpyDeviceToDevice);
cudaMemcpy(cngfDevScnZ, fDevScnZ, unSizeData*sizeof(float), cudaMemcpyDeviceToDevice);
//////
class_transformation(cngfDevScnX, cngfDevScnY, cngfDevScnZ ,
(*trans_inv)(1,1), (*trans_inv)(1,2), (*trans_inv)(1,3), (*trans_inv)(1,4),
(*trans_inv)(2,1), (*trans_inv)(2,2), (*trans_inv)(2,3), (*trans_inv)(2,4),
(*trans_inv)(3,1), (*trans_inv)(3,2), (*trans_inv)(3,3), (*trans_inv)(3,4)
);
class_nns_priority( cngfDevScnX, cngfDevScnY, cngfDevScnZ,
fDist, fDevSplit, unDevIdx, unDevAxis, bDevIsLeaf, fDevLoBound, fDevHiBound,
unMask, fDevMdlPairX, fDevMdlPairY, fDevMdlPairZ, fDevScnPairX, fDevScnPairY, fDevScnPairZ,
fSearchRadius, unSizeData, _unWidth, unQStep);
class_transformation(fDevMdlPairX, fDevMdlPairY, fDevMdlPairZ,
(*trans)(1,1), (*trans)(1,2), (*trans)(1,3), (*trans)(1,4),
(*trans)(2,1), (*trans)(2,2), (*trans)(2,3), (*trans)(2,4),
(*trans)(3,1), (*trans)(3,2), (*trans)(3,3), (*trans)(3,4)
);
class_transformation(fDevScnPairX, fDevScnPairY, fDevScnPairZ,
(*trans)(1,1), (*trans)(1,2), (*trans)(1,3), (*trans)(1,4),
(*trans)(2,1), (*trans)(2,2), (*trans)(2,3), (*trans)(2,4),
(*trans)(3,1), (*trans)(3,2), (*trans)(3,3), (*trans)(3,4)
);
/*fDevScnX
cout<<"Trans Mat is : \n";
printMatrix(trans);
cout<<"Trans Inv Mat is : \n";
printMatrix(trans_inv);
*/
float tmpscn_x[10];
float tmpscn_y[10];
float tmpscn_z[10];
float tmpmdl_x[10];
float tmpmdl_y[10];
float tmpmdl_z[10];
cudaMemcpy(tmpscn_x, fDevScnPairX, 10*sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(tmpscn_y, fDevScnPairY, 10*sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(tmpscn_z, fDevScnPairZ, 10*sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(tmpmdl_x, fDevMdlPairX, 10*sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(tmpmdl_y, fDevMdlPairY, 10*sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(tmpmdl_z, fDevMdlPairZ, 10*sizeof(float), cudaMemcpyDeviceToHost);
}
/**
 * Assembles a 4x4 homogeneous transform from a rotation and a translation.
 *
 * @param R             matrix whose elements (1..3, 1..3) fill the upper-left 3x3 part
 * @param dTranslation  three values placed in column 4 of rows 1..3
 * @return matrix with the bottom row set to (0, 0, 0, 1)
 */
Matrix CIcpGpuCuda::fillHomoMatrix(Matrix* R, double* dTranslation){
    Matrix homo(4,4);
    for (int row = 1; row <= 3; ++row) {
        for (int col = 1; col <= 3; ++col)
            homo(row,col) = (*R)(row,col);
        homo(row,4) = dTranslation[row-1];
    }
    // bottom row
    for (int col = 1; col <= 3; ++col)
        homo(4,col) = 0;
    homo(4,4) = 1;
    return homo;
}
/**
 * Computes, per axis, the sum of unSizeData elements divided by unPairs.
 *
 * cublasSasum only sums absolute values, so the plain sum is obtained as a
 * dot product with the device array `ones`.
 *
 * @param x, y, z  coordinate arrays handed to cublasSdot
 * @param center   receives the three results in center[0..2]
 *
 * The caller must ensure unPairs is non-zero.
 */
void CIcpGpuCuda::computeCentroid(float* x, float* y, float* z, float*& center){
    float* pAxes[3] = { x, y, z };
    for (int axis = 0; axis < 3; ++axis) {
        center[axis] = cublasSdot(unSizeData, pAxes[axis], 1, ones, 1);
        center[axis] /= unPairs;
    }
}
/**
 * Builds the 3x3 matrix H, where H(r, c) is the dot product of centered
 * scene axis r with centered model axis c over unSizeData elements.
 *
 * Inputs up to unSizeOfSec elements use a single cublasSdot per entry.
 * Larger inputs are split into sections of unSizeOfSec elements plus a
 * remainder, and the partial dot products are accumulated in double.
 *
 * @return the 3x3 matrix H
 */
Matrix CIcpGpuCuda::computeHMatrix(){
    float* pScn[3] = { fCenScnX, fCenScnY, fCenScnZ };
    float* pMdl[3] = { fCenModX, fCenModY, fCenModZ };

    Matrix H(3,3);
    H = 0.0;

    const unsigned unSizeOfSec = 200000; // need to be tuned for best performance!

    if (unSizeData <= unSizeOfSec) {
        for (int r = 0; r < 3; ++r)
            for (int c = 0; c < 3; ++c)
                H(r+1,c+1) = (double)cublasSdot(unSizeData, pScn[r], 1, pMdl[c], 1);
    }
    else {
        const unsigned unSections = (unsigned)(unSizeData/unSizeOfSec);
        const unsigned unStub = (unsigned)(unSizeData%unSizeOfSec);

        // full sections
        for (unsigned i = 0; i < unSections; i++) {
            const unsigned unOffset = i*unSizeOfSec;
            for (int r = 0; r < 3; ++r)
                for (int c = 0; c < 3; ++c)
                    H(r+1,c+1) += (double)cublasSdot(unSizeOfSec, pScn[r]+unOffset, 1, pMdl[c]+unOffset, 1);
        }

        // remainder after the last full section
        if (unStub) {
            const unsigned unOffset = unSections*unSizeOfSec;
            for (int r = 0; r < 3; ++r)
                for (int c = 0; c < 3; ++c)
                    H(r+1,c+1) += (double)cublasSdot(unStub, pScn[r]+unOffset, 1, pMdl[c]+unOffset, 1);
        }
    }
    getCublasErr();
    return H;
}
/** Prints rows 1..4 of `mat` to cout, one row per line, values separated by single spaces. */
void CIcpGpuCuda::printMatrix(Matrix* mat){
    for (int row = 1; row <= 4; ++row) {
        for (int col = 1; col <= 4; ++col) {
            if (col > 1) cout << " ";
            cout << (*mat)(row,col);
        }
        cout << endl;
    }
}