/* Variants that use a constant stride on GPUs. */
#ifdef __KERNEL_GPU__
-#define MATS(A, n, r, c, s) A[((r)*(n)+(c))*(s)]
+# define MATS(A, n, r, c, s) A[((r)*(n)+(c))*(s)]
/* Element access when only the lower-triangular elements are stored. */
-#define MATHS(A, r, c, s) A[((r)*((r)+1)/2+(c))*(s)]
-#define VECS(V, i, s) V[(i)*(s)]
+# define MATHS(A, r, c, s) A[((r)*((r)+1)/2+(c))*(s)]
+# define VECS(V, i, s) V[(i)*(s)]
#else
-#define MATS(A, n, r, c, s) MAT(A, n, r, c)
-#define MATHS(A, r, c, s) A[(r)*((r)+1)/2+(c)]
-#define VECS(V, i, s) V[i]
+# define MATS(A, n, r, c, s) MAT(A, n, r, c)
+# define MATHS(A, r, c, s) A[(r)*((r)+1)/2+(c)]
+# define VECS(V, i, s) V[i]
#endif
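/* For example (illustrative values): with stride s = 4, MATHS places row r = 2
 * of the packed lower triangle at offsets (2*3/2 + c)*4, so elements (2,0),
 * (2,1) and (2,2) live at offsets 12, 16 and 20 of A. */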
/* Zeroing helpers. */
ccl_device_inline void math_vector_zero(float *v, int n)
{
- for(int i = 0; i < n; i++)
+ for(int i = 0; i < n; i++) {
v[i] = 0.0f;
+ }
}
ccl_device_inline void math_matrix_zero(float *A, int n)
{
- for(int row = 0; row < n; row++)
- for(int col = 0; col <= row; col++)
+ for(int row = 0; row < n; row++) {
+ for(int col = 0; col <= row; col++) {
MAT(A, n, row, col) = 0.0f;
+ }
+ }
}
/* Elementary vector operations. */
ccl_device_inline void math_vector_add(float *a, const float *ccl_restrict b, int n)
{
- for(int i = 0; i < n; i++)
+ for(int i = 0; i < n; i++) {
a[i] += b[i];
+ }
}
ccl_device_inline void math_vector_mul(float *a, const float *ccl_restrict b, int n)
{
- for(int i = 0; i < n; i++)
+ for(int i = 0; i < n; i++) {
a[i] *= b[i];
+ }
}
ccl_device_inline void math_vector_mul_strided(ccl_global float *a, const float *ccl_restrict b, int astride, int n)
{
- for(int i = 0; i < n; i++)
+ for(int i = 0; i < n; i++) {
a[i*astride] *= b[i];
+ }
}
ccl_device_inline void math_vector_scale(float *a, float b, int n)
{
- for(int i = 0; i < n; i++)
+ for(int i = 0; i < n; i++) {
a[i] *= b;
+ }
}
ccl_device_inline void math_vector_max(float *a, const float *ccl_restrict b, int n)
{
- for(int i = 0; i < n; i++)
+ for(int i = 0; i < n; i++) {
a[i] = max(a[i], b[i]);
+ }
}
ccl_device_inline void math_vec3_add(float3 *v, int n, float *x, float3 w)
{
- for(int i = 0; i < n; i++)
+ for(int i = 0; i < n; i++) {
v[i] += w*x[i];
+ }
}
ccl_device_inline void math_vec3_add_strided(ccl_global float3 *v, int n, float *x, float3 w, int stride)
{
- for(int i = 0; i < n; i++)
- v[i*stride] += w*x[i];
+ for(int i = 0; i < n; i++) {
+ ccl_global float *elem = (ccl_global float*) (v + i*stride);
+ atomic_add_and_fetch_float(elem+0, w.x*x[i]);
+ atomic_add_and_fetch_float(elem+1, w.y*x[i]);
+ atomic_add_and_fetch_float(elem+2, w.z*x[i]);
+ }
}
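/* Note: the strided variant accumulates into ccl_global memory that multiple
 * threads may target concurrently, which is why each component goes through
 * atomic_add_and_fetch_float instead of a plain vector +=. */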
/* Elementary matrix operations. */
ccl_device_inline void math_trimatrix_add_diagonal(ccl_global float *A, int n, float val, int stride)
{
- for(int row = 0; row < n; row++)
+ for(int row = 0; row < n; row++) {
MATHS(A, row, row, stride) += val;
+ }
}
/* Add Gramian matrix of v to A.
 * The Gramian matrix of v is v^T*v, so element (i,j) is v[i]*v[j]. */
ccl_device_inline void math_matrix_add_gramian(float *A,
                                               int n,
                                               const float *ccl_restrict v,
                                               float weight)
{
- for(int row = 0; row < n; row++)
- for(int col = 0; col <= row; col++)
+ for(int row = 0; row < n; row++) {
+ for(int col = 0; col <= row; col++) {
MAT(A, n, row, col) += v[row]*v[col]*weight;
+ }
+ }
}
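/* For example, for n = 2 and v = (x, y), the call adds weight*x*x to element
 * (0,0), weight*y*x to (1,0) and weight*y*y to (1,1); the upper triangle is
 * implied by symmetry. */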
/* Add Gramian matrix of v to A.
 * The Gramian matrix of v is v^T*v, so element (i,j) is v[i]*v[j]. */
ccl_device_inline void math_trimatrix_add_gramian_strided(ccl_global float *A,
                                                          int n,
                                                          const float *ccl_restrict v,
                                                          float weight,
                                                          int stride)
{
- for(int row = 0; row < n; row++)
- for(int col = 0; col <= row; col++)
- MATHS(A, row, col, stride) += v[row]*v[col]*weight;
+ for(int row = 0; row < n; row++) {
+ for(int col = 0; col <= row; col++) {
+ atomic_add_and_fetch_float(&MATHS(A, row, col, stride), v[row]*v[col]*weight);
+ }
+ }
}
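/* As with math_vec3_add_strided above, the strided matrix lives in global
 * memory that several threads may update at once, so the accumulation uses
 * atomic_add_and_fetch_float. */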
/* Transpose matrix A in-place. */
}
}
-
-
-
/* Solvers for matrix problems. */
/* In-place Cholesky-Banachiewicz decomposition of the square, positive-definite matrix A
 * into a lower-triangular matrix L so that A = L*L^T.
 * The lower-triangular part of A is overwritten with L. */
}
}
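/* Sketch of the update the decomposition performs (standard
 * Cholesky-Banachiewicz, processed row by row):
 *   L(r,c) = (A(r,c) - sum_{k<c} L(r,k)*L(c,k)) / L(c,c)   for c < r,
 *   L(r,r) = sqrt(A(r,r) - sum_{k<r} L(r,k)^2). */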
-
-
-
-
/* Perform the Jacobi Eigenvalue Method on matrix A.
 * A is assumed to be a symmetric matrix, therefore only the lower-triangular
 * part is ever accessed.
 * The algorithm overwrites the contents of A. */
{
const float singular_epsilon = 1e-9f;
- for (int row = 0; row < n; row++)
- for (int col = 0; col < n; col++)
+ for(int row = 0; row < n; row++) {
+ for(int col = 0; col < n; col++) {
MATS(V, n, row, col, v_stride) = (col == row) ? 1.0f : 0.0f;
+ }
+ }
- for (int sweep = 0; sweep < 8; sweep++) {
+ for(int sweep = 0; sweep < 8; sweep++) {
float off_diagonal = 0.0f;
- for (int row = 1; row < n; row++)
- for (int col = 0; col < row; col++)
+ for(int row = 1; row < n; row++) {
+ for(int col = 0; col < row; col++) {
off_diagonal += fabsf(MAT(A, n, row, col));
- if (off_diagonal < 1e-7f) {
+ }
+ }
+ if(off_diagonal < 1e-7f) {
/* The matrix has nearly reached diagonal form.
 * Since the eigenvalues are only used to determine truncation, their exact values
 * aren't required - a relative error of a few ULPs won't matter at all. */
break;
float abs_element = fabsf(element);
/* If we're in a later sweep and the element already is very small, just set it to zero and skip the rotation. */
- if (sweep > 3 && abs_element <= singular_epsilon*fabsf(MAT(A, n, row, row)) && abs_element <= singular_epsilon*fabsf(MAT(A, n, col, col))) {
+ if(sweep > 3 && abs_element <= singular_epsilon*fabsf(MAT(A, n, row, row)) && abs_element <= singular_epsilon*fabsf(MAT(A, n, col, col))) {
MAT(A, n, row, col) = 0.0f;
continue;
}
* Then, we compute sin(phi) and cos(phi) themselves. */
float singular_diff = MAT(A, n, row, row) - MAT(A, n, col, col);
float ratio;
- if (abs_element > singular_epsilon*fabsf(singular_diff)) {
+ if(abs_element > singular_epsilon*fabsf(singular_diff)) {
float cot_2phi = 0.5f*singular_diff / element;
ratio = 1.0f / (fabsf(cot_2phi) + sqrtf(1.0f + cot_2phi*cot_2phi));
- if (cot_2phi < 0.0f) ratio = -ratio; /* Copy sign. */
+ if(cot_2phi < 0.0f) ratio = -ratio; /* Copy sign. */
}
else {
ratio = element / singular_diff;
}
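/* At this point ratio holds tan(phi); the standard continuation recovers
 * cos(phi) = 1/sqrt(1 + tan(phi)^2) and sin(phi) = tan(phi)*cos(phi) before
 * the rotation is applied to A and V. */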
/* Sort eigenvalues and the associated eigenvectors. */
- for (int i = 0; i < n - 1; i++) {
+ for(int i = 0; i < n - 1; i++) {
float v = MAT(A, n, i, i);
int k = i;
- for (int j = i; j < n; j++) {
- if (MAT(A, n, j, j) >= v) {
+ for(int j = i; j < n; j++) {
+ if(MAT(A, n, j, j) >= v) {
v = MAT(A, n, j, j);
k = j;
}
}
- if (k != i) {
+ if(k != i) {
/* Swap eigenvalues. */
MAT(A, n, k, k) = MAT(A, n, i, i);
MAT(A, n, i, i) = v;
/* Swap eigenvectors. */
- for (int j = 0; j < n; j++) {
+ for(int j = 0; j < n; j++) {
float v = MATS(V, n, i, j, v_stride);
MATS(V, n, i, j, v_stride) = MATS(V, n, k, j, v_stride);
MATS(V, n, k, j, v_stride) = v;
}
#ifdef __KERNEL_SSE3__
-
-ccl_device_inline void math_vector_zero_sse(__m128 *A, int n)
+ccl_device_inline void math_vector_zero_sse(float4 *A, int n)
{
- for(int i = 0; i < n; i++)
- A[i] = _mm_setzero_ps();
+ for(int i = 0; i < n; i++) {
+ A[i] = make_float4(0.0f);
+ }
}
-ccl_device_inline void math_matrix_zero_sse(__m128 *A, int n)
+
+ccl_device_inline void math_matrix_zero_sse(float4 *A, int n)
{
- for(int row = 0; row < n; row++)
- for(int col = 0; col <= row; col++)
- MAT(A, n, row, col) = _mm_setzero_ps();
+ for(int row = 0; row < n; row++) {
+ for(int col = 0; col <= row; col++) {
+ MAT(A, n, row, col) = make_float4(0.0f);
+ }
+ }
}
/* Add Gramian matrix of v to A.
* The Gramian matrix of v is v^T*v, so element (i,j) is v[i]*v[j]. */
-ccl_device_inline void math_matrix_add_gramian_sse(__m128 *A, int n, const __m128 *ccl_restrict v, __m128 weight)
+ccl_device_inline void math_matrix_add_gramian_sse(float4 *A, int n, const float4 *ccl_restrict v, float4 weight)
{
- for(int row = 0; row < n; row++)
- for(int col = 0; col <= row; col++)
- MAT(A, n, row, col) = _mm_add_ps(MAT(A, n, row, col), _mm_mul_ps(_mm_mul_ps(v[row], v[col]), weight));
+ for(int row = 0; row < n; row++) {
+ for(int col = 0; col <= row; col++) {
+ MAT(A, n, row, col) = MAT(A, n, row, col) + v[row] * v[col] * weight;
+ }
+ }
}
-ccl_device_inline void math_vector_add_sse(__m128 *V, int n, const __m128 *ccl_restrict a)
+ccl_device_inline void math_vector_add_sse(float4 *V, int n, const float4 *ccl_restrict a)
{
- for(int i = 0; i < n; i++)
- V[i] = _mm_add_ps(V[i], a[i]);
+ for(int i = 0; i < n; i++) {
+ V[i] += a[i];
+ }
}
-ccl_device_inline void math_vector_mul_sse(__m128 *V, int n, const __m128 *ccl_restrict a)
+ccl_device_inline void math_vector_mul_sse(float4 *V, int n, const float4 *ccl_restrict a)
{
- for(int i = 0; i < n; i++)
- V[i] = _mm_mul_ps(V[i], a[i]);
+ for(int i = 0; i < n; i++) {
+ V[i] *= a[i];
+ }
}
-ccl_device_inline void math_vector_max_sse(__m128 *a, const __m128 *ccl_restrict b, int n)
+ccl_device_inline void math_vector_max_sse(float4 *a, const float4 *ccl_restrict b, int n)
{
- for(int i = 0; i < n; i++)
- a[i] = _mm_max_ps(a[i], b[i]);
+ for(int i = 0; i < n; i++) {
+ a[i] = max(a[i], b[i]);
+ }
}
-ccl_device_inline void math_matrix_hsum(float *A, int n, const __m128 *ccl_restrict B)
+ccl_device_inline void math_matrix_hsum(float *A, int n, const float4 *ccl_restrict B)
{
- for(int row = 0; row < n; row++)
- for(int col = 0; col <= row; col++)
- MAT(A, n, row, col) = _mm_hsum_ss(MAT(B, n, row, col));
+ for(int row = 0; row < n; row++) {
+ for(int col = 0; col <= row; col++) {
+ MAT(A, n, row, col) = reduce_add(MAT(B, n, row, col))[0];
+ }
+ }
}
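/* Each lane of a float4 in B holds an independent partial sum (the SSE paths
 * process four values per iteration), so reduce_add() collapses the four lanes
 * and lane 0 of the result is the scalar total stored into A. */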
#endif