- Added OpenMP code, it is enabled by defining PARALLEL=1 for the elbeem
authorNils Thuerey <nils@thuerey.de>
Wed, 21 Nov 2007 22:12:16 +0000 (22:12 +0000)
committerNils Thuerey <nils@thuerey.de>
Wed, 21 Nov 2007 22:12:16 +0000 (22:12 +0000)
  compilation.  Currently, it is not yet active by default, but
  Genscher wanted to do some tests.
  It can be used to distribute the computation load onto multiple shared-
  memory CPUs by splitting the domain along the y-axis (assuming a
  gravity force along z). However, there is no load balancing: so
  if there's fluid only in one of the y-axis halves you will not get
  a speedup for 2 CPUs.

- Added a fix for the memory allocation bugs #7120 and #6775. In
  solver_init.cpp there are now several variables max___MemChunk
  (line 692+), that set upper limits for various systems. The same
  problem existed for mac & linux, but the limit is higher, so
  it probably went by undetected. The windows limit is currently 1GB,
  if the strange 700MB limit problems mentioned in the bug regports the
  bugs persist, this could be further reduced. For 64bit compilations
  this problem shouldn't exist anyway.
  What's still missing is a display of how much the resolution was
  reduced to fit into memory...

- And some minor solver code cleanup.

14 files changed:
intern/elbeem/extern/elbeem.h
intern/elbeem/intern/attributes.cpp
intern/elbeem/intern/loop_tools.h
intern/elbeem/intern/ntl_vector3dim.h
intern/elbeem/intern/particletracer.cpp
intern/elbeem/intern/simulation_object.cpp
intern/elbeem/intern/solver_class.h
intern/elbeem/intern/solver_init.cpp
intern/elbeem/intern/solver_interface.cpp
intern/elbeem/intern/solver_interface.h
intern/elbeem/intern/solver_main.cpp
intern/elbeem/intern/solver_util.cpp
intern/elbeem/intern/utilities.cpp
intern/elbeem/intern/utilities.h

index b3feda8bbe8be0dee4385b1112080fdda6875707..2a594dd07e65147fd443513bba72b9207009f0ad 100644 (file)
@@ -154,7 +154,7 @@ typedef struct elbeemMesh {
        short volumeInitType;
 
        /* name of the mesh, mostly for debugging */
-       char *name;
+       const char *name;
 } elbeemMesh;
 
 // API functions
index 8e337a92a4e61648629372332d84d488b4b3e4f2..464486f2500ea1b995de7bfcc940d7838d1fa0ac 100644 (file)
@@ -103,7 +103,7 @@ void AttributeList::readMat4Gfx(string name, ntlMat4Gfx defaultValue, string sou
 
 // set that a parameter can be given, and will be ignored...
 bool AttributeList::ignoreParameter(string name, string source) {
-       name=source=(""); // remove warning
+       name = source = ("");
        return false;
 }
                
index 3c15a6092103ad1ec2f65a61f720b40d16441264..70ecb9ce3e080cec9fc77b4f042b607d050e6c7e 100644 (file)
 
 
        
-#define unused_GRID_REGION_END() \
-       } /* main_region */  \
-       // end unusedGRID_REGION_END
-
 
 //  -----------------------------------------------------------------------------------
 #else // PARALLEL==1
 
-#include "paraloop.h"
+//#include "paraloop.h"
+#define PERFORM_USQRMAXCHECK USQRMAXCHECK(usqr,ux,uy,uz, calcMaxVlen, calcMxvx,calcMxvy,calcMxvz);
+#define LIST_EMPTY(x)    calcListEmpty.push_back( x );
+#define LIST_FULL(x)     calcListFull.push_back( x );
+#define FSGR_ADDPART(x)  calcListParts.push_back( x );
+
+
+// parallel region
+//was: # pragma omp parallel default(shared) 
+#if COMPRESSGRIDS!=1
+       // requires compressed grids...!
+       ERROR!
+#endif
+
+// loop start
+#define  GRID_REGION_START()  \
+       { \
+        \
+        \
+       if(mSizez<2) { \
+       mPanic = 1; \
+       errFatal("ParaLoop::2D","Not valid...!", SIMWORLD_GENERICERROR); \
+       } \
+        \
+        \
+       vector<LbmPoint> calcListFull; \
+       vector<LbmPoint> calcListEmpty; \
+       vector<ParticleObject> calcListParts; \
+       LbmFloat calcMxvx, calcMxvy, calcMxvz, calcMaxVlen; \
+       calcMxvx = calcMxvy = calcMxvz = calcMaxVlen = 0.0; \
+       calcListEmpty.reserve(mListEmpty.capacity() / omp_get_num_threads() ); \
+       calcListFull.reserve( mListFull.capacity()  / omp_get_num_threads() ); \
+       calcListParts.reserve(mSizex); \
+        \
+        \
+       const int id = omp_get_thread_num(); \
+       const int Nthrds = omp_get_num_threads(); \
+        \
+        \
+        \
+        \
+        \
+       int kdir = 1; \
+        \
+       int kstart=getForZMinBnd(), kend=getForZMaxBnd(mMaxRefine); \
+       if(gridLoopBound>0){ kstart=getForZMin1(); kend=getForZMax1(mMaxRefine); } \
+       LbmFloat *ccel = NULL, *tcel = NULL; \
+       CellFlagType *pFlagSrc=NULL, *pFlagDst=NULL; \
+        \
+        \
+       if(mLevel[mMaxRefine].setCurr==1) { \
+       kdir = -1; \
+       int temp = kend; \
+       kend = kstart-1; \
+       kstart = temp-1; \
+       } \
+        \
+       const int Nj = mLevel[mMaxRefine].lSizey; \
+       int jstart = 0+( id * (Nj / Nthrds) ); \
+       int jend   = 0+( (id+1) * (Nj / Nthrds) ); \
+       if( ((Nj/Nthrds) *Nthrds) != Nj) { \
+       errMsg("LbmFsgrSolver","Invalid domain size Nj="<<Nj<<" Nthrds="<<Nthrds); \
+       } \
+        \
+       if(jstart<gridLoopBound) jstart = gridLoopBound; \
+       if(jend>mLevel[mMaxRefine].lSizey-gridLoopBound) jend = mLevel[mMaxRefine].lSizey-gridLoopBound; \
+        \
+       debMsgStd("ParaLoop::OMP",DM_MSG,"Thread:"<<id<<" i:"<<istart<<"-"<<iend<<" j:"<<jstart<<"-"<<jend<<", k:"<<kstart<<"-"<<kend<<"  ", 1); \
+        \
+
+
+
+
+// para GRID LOOP END is parainc3 
 
 #endif // PARALLEL==1
 
 
 
 
+
 // old loop for COMPRESSGRIDS==0
 #define old__GRID_LOOP_START() \
   for(int k=kstart;k<kend;++k) { \
          for(int j=1;j<mLevel[lev].lSizey-1;++j) { \
                for(int i=0;i<mLevel[lev].lSizex-2;   ) {
 
+
index 912a37350c127f2b0c984787a60ea2663b4d660e..27c3be0d71f143758b8e8ec3e1f53738bd115a31 100644 (file)
@@ -22,6 +22,7 @@
 #include <math.h>
 #include <string.h>
 #include <stdio.h>
+#include <stdlib.h>
 
 // hack for MSVC6.0 compiler
 #ifdef _MSC_VER
index c9da808543a0a9f8acc77430a9c71e43fa1f31f2..819fcdd0b9aacd25b868276bd5a43f5094decde9 100644 (file)
@@ -325,7 +325,7 @@ void ParticleTracer::getTriangles(double time, vector<ntlTriangle> *triangles,
        // suppress warnings...
        vertices = NULL; triangles = NULL;
        normals = NULL; objectId = 0;
-       time = 0.0;
+       time = 0.;
 #else // ELBEEM_PLUGIN
        int pcnt = 0;
        // currently not used in blender
index 2ff600a36d4ee07324db1acfa997da8059371b48..9b47ae696afff4a2d784e0a00f23406d59034da1 100644 (file)
@@ -15,7 +15,6 @@
 #include "solver_interface.h"
 #include "particletracer.h"
 #include "elbeem.h"
-#include <stdlib.h> /* exit(3) - also in linux */
 
 #ifdef _WIN32
 #else
@@ -69,6 +68,7 @@ SimulationObject::~SimulationObject()
 /*! init tree for certain geometry init */
 /*****************************************************************************/
 void SimulationObject::initGeoTree() {
+       // unused!! overriden by solver interface       
        if(mpGlob == NULL) { 
                errFatal("SimulationObject::initGeoTree error","Requires globals!", SIMWORLD_INITERROR); 
                return;
@@ -80,7 +80,7 @@ void SimulationObject::initGeoTree() {
        char treeFlag = (1<<(mGeoInitId+4));
        mpGiTree = new ntlTree( 20, 4, // warning - fixed values for depth & maxtriangles here...
                                                                                                scene, treeFlag );
-       exit(1); // unused!? overriden by solver interface      
+       // unused!! overriden by solver interface       
 }
 
 /*****************************************************************************/
@@ -310,7 +310,7 @@ void SimulationObject::step( void )
                // dont advance for stopped time
                mpLbm->step();
                mTime += mpParam->getTimestep();
-//if(mTime>0.001) { errMsg("DEBUG!!!!!!!!","quit mlsu..."); exit(1); } // PROFILE DEBUG TEST!
+               //if(mTime>0.001) { errMsg("DEBUG!!!!!!!!","quit mlsu..."); xit(1); } // PROFILE DEBUG TEST!
        }
        if(mpLbm->getPanic()) mPanic = true;
 
index 930c1863aa7bce2948be79753fa38a0bb22ddb0a..d46f065adfd6632ed9aa4b58479ce0a510b42c1a 100644 (file)
 // sirdude fix for solaris
 #if !defined(linux) && defined(sun)
 #ifndef expf
-#define expf(a)                exp((double)(a))
+#define expf(x) exp((double)(x))
 #endif
 #endif
 
index abec4a89c89b1f4abb726769c77caa29f3bee58c..b0ce130c13684b330360ee57a6c27aa39c593aea 100644 (file)
@@ -655,6 +655,7 @@ bool LbmFsgrSolver::initializeSolverMemory()
        int orgSz = mSizez;
        double sizeReduction = 1.0;
        double memEstFromFunc = -1.0;
+       double memEstFine = -1.0;
        string memreqStr("");   
        bool firstMInit = true;
        int minitTries=0;
@@ -672,7 +673,7 @@ bool LbmFsgrSolver::initializeSolverMemory()
                firstMInit=false;
 
                calculateMemreqEstimate( mSizex, mSizey, mSizez, 
-                               mMaxRefine, mFarFieldSize, &memEstFromFunc, &memreqStr );
+                               mMaxRefine, mFarFieldSize, &memEstFromFunc, &memEstFine, &memreqStr );
                
                double memLimit;
                string memLimStr("-");
@@ -685,13 +686,36 @@ bool LbmFsgrSolver::initializeSolverMemory()
                        memLimit = 16.0* 1024.0*1024.0*1024.0;
                        memLimStr = string("16GB");
                }
-               if(memEstFromFunc>memLimit) {
+
+               // restrict max. chunk of 1 mem block to 1GB for windos
+               bool memBlockAllocProblem = false;
+               double maxWinMemChunk = 1100.*1024.*1024.;
+               double maxMacMemChunk = 1200.*1024.*1024.;
+               double maxDefaultMemChunk = 2.*1024.*1024.*1024.;
+               //std::cerr<<" memEstFine "<< memEstFine <<" maxWin:" <<maxWinMemChunk <<" maxMac:" <<maxMacMemChunk ; // DEBUG
+#ifdef WIN32
+               if(memEstFine> maxWinMemChunk) {
+                       memBlockAllocProblem = true;
+               }
+#endif // WIN32
+#ifdef __APPLE__
+               if(memEstFine> maxMacMemChunk) {
+                       memBlockAllocProblem = true;
+               }
+#endif // Mac
+               if(sizeof(int)==4 && memEstFine>maxDefaultMemChunk) {
+                       // max memory chunk for 32bit systems 2gig
+                       memBlockAllocProblem = true;
+               }
+
+               if(memEstFromFunc>memLimit || memBlockAllocProblem) {
                        sizeReduction *= 0.9;
                        mSizex = (int)(orgSx * sizeReduction);
                        mSizey = (int)(orgSy * sizeReduction);
                        mSizez = (int)(orgSz * sizeReduction);
                        debMsgStd("LbmFsgrSolver::initialize",DM_WARNING,"initGridSizes: memory limit exceeded "<<
                                        //memEstFromFunc<<"/"<<memLimit<<", "<<
+                                       //memEstFine<<"/"<<maxWinMemChunk<<", "<<
                                        memreqStr<<"/"<<memLimStr<<", "<<
                                        "retrying: "<<PRINT_VEC(mSizex,mSizey,mSizez)<<" org:"<<PRINT_VEC(orgSx,orgSy,orgSz)
                                        , 3 );
@@ -778,10 +802,6 @@ bool LbmFsgrSolver::initializeSolverMemory()
        mLevel[ mMaxRefine ].simCellSize = mpParam->getCellSize();
        mLevel[ mMaxRefine ].lcellfactor = 1.0;
        LONGINT rcellSize = ((mLevel[mMaxRefine].lSizex*mLevel[mMaxRefine].lSizey*mLevel[mMaxRefine].lSizez) *dTotalNum);
-       // +4 for safety ?
-       mLevel[ mMaxRefine ].mprsFlags[0] = new CellFlagType[ rcellSize/dTotalNum +4 ];
-       mLevel[ mMaxRefine ].mprsFlags[1] = new CellFlagType[ rcellSize/dTotalNum +4 ];
-       ownMemCheck += 2 * sizeof(CellFlagType) * (rcellSize/dTotalNum +4);
 
 #if COMPRESSGRIDS==0
        mLevel[ mMaxRefine ].mprsCells[0] = new LbmFloat[ rcellSize +4 ];
@@ -789,11 +809,34 @@ bool LbmFsgrSolver::initializeSolverMemory()
        ownMemCheck += 2 * sizeof(LbmFloat) * (rcellSize+4);
 #else // COMPRESSGRIDS==0
        LONGINT compressOffset = (mLevel[mMaxRefine].lSizex*mLevel[mMaxRefine].lSizey*dTotalNum*2);
+       // D int tmp = ( (rcellSize +compressOffset +4)/(1024*1024) )*4;
+       // D printf("Debug MEMMMM excee: %d\n", tmp);
        mLevel[ mMaxRefine ].mprsCells[1] = new LbmFloat[ rcellSize +compressOffset +4 ];
        mLevel[ mMaxRefine ].mprsCells[0] = mLevel[ mMaxRefine ].mprsCells[1]+compressOffset;
        ownMemCheck += sizeof(LbmFloat) * (rcellSize +compressOffset +4);
 #endif // COMPRESSGRIDS==0
 
+       if(!mLevel[ mMaxRefine ].mprsCells[1] || !mLevel[ mMaxRefine ].mprsCells[0]) {
+               errFatal("LbmFsgrSolver::initialize","Fatal: Couldnt allocate memory (1)! Aborting...",SIMWORLD_INITERROR);
+               return false;
+       }
+
+       // +4 for safety ?
+       mLevel[ mMaxRefine ].mprsFlags[0] = new CellFlagType[ rcellSize/dTotalNum +4 ];
+       mLevel[ mMaxRefine ].mprsFlags[1] = new CellFlagType[ rcellSize/dTotalNum +4 ];
+       ownMemCheck += 2 * sizeof(CellFlagType) * (rcellSize/dTotalNum +4);
+       if(!mLevel[ mMaxRefine ].mprsFlags[1] || !mLevel[ mMaxRefine ].mprsFlags[0]) {
+               errFatal("LbmFsgrSolver::initialize","Fatal: Couldnt allocate memory (2)! Aborting...",SIMWORLD_INITERROR);
+
+#if COMPRESSGRIDS==0
+               delete[] mLevel[ mMaxRefine ].mprsCells[0];
+               delete[] mLevel[ mMaxRefine ].mprsCells[1];
+#else // COMPRESSGRIDS==0
+               delete[] mLevel[ mMaxRefine ].mprsCells[1];
+#endif // COMPRESSGRIDS==0
+               return false;
+       }
+
        LbmFloat lcfdimFac = 8.0;
        if(LBMDIM==2) lcfdimFac = 4.0;
        for(int i=mMaxRefine-1; i>=0; i--) {
index 2539556617bb4752e6d81b4f07e4c3e4ba92ad25..d25850a003be81de8a3894110a15b789a8c55070 100644 (file)
@@ -17,7 +17,6 @@
 #include "ntl_world.h"
 #include "elbeem.h"
 
-#include <stdlib.h> /* getenv(3) - also in linux */
 
 
 
@@ -142,7 +141,7 @@ void initGridSizes(int &sizex, int &sizey, int &sizez,
 
 void calculateMemreqEstimate( int resx,int resy,int resz, 
                int refine, float farfield,
-               double *reqret, string *reqstr) {
+               double *reqret, double *reqretFine, string *reqstr) {
        // debug estimation?
        const bool debugMemEst = true;
        // COMPRESSGRIDS define is not available here, make sure it matches
@@ -150,6 +149,7 @@ void calculateMemreqEstimate( int resx,int resy,int resz,
        // make sure we can handle bid numbers here... all double
        double memCnt = 0.0;
        double ddTotalNum = (double)dTotalNum;
+       if(reqretFine) *reqretFine = -1.;
 
        double currResx = (double)resx;
        double currResy = (double)resy;
@@ -159,10 +159,12 @@ void calculateMemreqEstimate( int resx,int resy,int resz,
        if(debugMemEst) debMsgStd("calculateMemreqEstimate",DM_MSG,"res:"<<PRINT_VEC(currResx,currResy,currResz)<<" rcellSize:"<<rcellSize<<" mc:"<<memCnt, 10);
   if(!useGridComp) {
                memCnt += (double)(sizeof(LbmFloat) * (rcellSize +4.0) *2.0);
+               if(reqretFine) *reqretFine = (double)(sizeof(LbmFloat) * (rcellSize +4.0) *2.0);
                if(debugMemEst) debMsgStd("calculateMemreqEstimate",DM_MSG," no-comp, mc:"<<memCnt, 10);
        } else {
                double compressOffset = (double)(currResx*currResy*ddTotalNum*2.0);
                memCnt += (double)(sizeof(LbmFloat) * (rcellSize+compressOffset +4.0));
+               if(reqretFine) *reqretFine = (double)(sizeof(LbmFloat) * (rcellSize+compressOffset +4.0));
                if(debugMemEst) debMsgStd("calculateMemreqEstimate",DM_MSG," w-comp, mc:"<<memCnt, 10);
        }
        for(int i=refine-1; i>=0; i--) {
index 1dfdf156ee50b6d28436694f4d52d038c0d4cbeb..c3dc4983cac33c221b7fb160e59b9731834918a0 100644 (file)
@@ -21,7 +21,7 @@
 #if LBM_USE_GUI==1
 #define USE_GLUTILITIES
 // for debug display
-#include <GL/gl.h>
+//#include <GL/gl.h>
 #include "../gui/guifuncs.h"
 #endif
 
@@ -596,8 +596,10 @@ class LbmSolverInterface
 void initGridSizes(int &mSizex, int &mSizey, int &mSizez,
                ntlVec3Gfx &mvGeoStart, ntlVec3Gfx &mvGeoEnd, 
                int mMaxRefine, bool parallel);
+// return the amount of memory required in total (reqret)
+// and for the finest grid only (reqretFine, can be NULL)
 void calculateMemreqEstimate(int resx,int resy,int resz, int refine,
-               float farfieldsize, double *reqret, string *reqstr);
+               float farfieldsize, double *reqret, double *reqretFine, string *reqstr);
 
 //! helper function to convert flag to string (for debuggin)
 string convertCellFlagType2String( CellFlagType flag );
index 270e8867b3cea2a6e7f4986143cb7bd407ea8bb9..afc883972e2326fa212d0832b0bee5bda3837779 100644 (file)
@@ -7,11 +7,11 @@
  *
  *****************************************************************************/
 
-#include <stdlib.h> /* rand(3) - also in linux */
 #include "solver_class.h"
 #include "solver_relax.h"
 #include "particletracer.h"
 #include "loop_tools.h"
+#include <stdlib.h>
 
 /*****************************************************************************/
 /*! perform a single LBM step */
@@ -375,7 +375,11 @@ LbmFsgrSolver::mainLoop(int lev)
        const int gridLoopBound=1;
        GRID_REGION_INIT();
 #if PARALLEL==1
-#include "paraloopstart.h"
+#pragma omp parallel default(shared) \
+  reduction(+: \
+         calcCurrentMass,calcCurrentVolume, \
+               calcCellsFilled,calcCellsEmptied, \
+               calcNumUsedCells )
        GRID_REGION_START();
 #else // PARALLEL==1
        GRID_REGION_START();
@@ -1112,7 +1116,11 @@ LbmFsgrSolver::preinitGrids()
        
                GRID_REGION_INIT();
 #if PARALLEL==1
-#include "paraloopstart.h"
+#pragma omp parallel default(shared) \
+  reduction(+: \
+         calcCurrentMass,calcCurrentVolume, \
+               calcCellsFilled,calcCellsEmptied, \
+               calcNumUsedCells )
 #endif // PARALLEL==1
                GRID_REGION_START();
                GRID_LOOP_START();
@@ -1145,7 +1153,11 @@ LbmFsgrSolver::standingFluidPreinit()
 
        GRID_REGION_INIT();
 #if PARALLEL==1
-#include "paraloopstart.h"
+#pragma omp parallel default(shared) \
+  reduction(+: \
+         calcCurrentMass,calcCurrentVolume, \
+               calcCellsFilled,calcCellsEmptied, \
+               calcNumUsedCells )
 #endif // PARALLEL==1
        GRID_REGION_START();
 
index 65cc2200d4e624a436f41a596631e16defdbf167..a6685babe680b22aafc992d605b4c41b77846169 100644 (file)
@@ -15,8 +15,7 @@
 #include "ntl_world.h"
 #include "simulation_object.h"
 
-#include <stdlib.h> /* rand(3) */
-
+#include <stdlib.h>
 #include <zlib.h>
 #ifndef sqrtf
 #define sqrtf sqrt
index 332052e91b62ba3450dbc19d7c3f1a7429e77c98..551c4d0d3842769ee065de69df065cfd53988847 100644 (file)
@@ -10,7 +10,6 @@
 
 #include <iostream>
 #include <sstream>
-#include <stdlib.h> /* getenv(3), strtol(3) */
 #ifdef WIN32
 // for timing
 #include <windows.h>
@@ -482,7 +481,7 @@ double elbeemEstimateMemreq(int res,
        double memreq = -1.0;
        string memreqStr("");   
        // ignore farfield for now...
-       calculateMemreqEstimate(resx,resy,resz, refine, 0., &memreq, &memreqStr );
+       calculateMemreqEstimate(resx,resy,resz, refine, 0., &memreq, NULL, &memreqStr );
 
        if(retstr) { 
                // copy at max. 32 characters
index 0f65408d23c7c0bd033a8437c60483909397c5ff..825e92251fe5d4d7705fb06b3878631ce3648730 100644 (file)
@@ -9,11 +9,6 @@
 #ifndef UTILITIES_H
 #include "ntl_vector3dim.h"
 
-// Solaris requires ieeefp.h for finite(3C)
-#if !defined(linux) && defined(sun)
-#include <ieeefp.h>
-#endif
-
 
 /* debugging outputs , debug level 0 (off) to 10 (max) */
 #ifdef ELBEEM_PLUGIN