Cycles: Cleanup, use ccl_restrict instead of ccl_restrict_ptr
[blender.git] / intern / cycles / kernel / filter / filter_features_sse.h
1 /*
2  * Copyright 2011-2017 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 CCL_NAMESPACE_BEGIN
18
/* Unaligned load of four consecutive floats belonging to pass `pass`,
 * i.e. the floats starting at buffer[pass*pass_stride].
 * Expects `buffer` and `pass_stride` to be in scope where the macro is used. */
#define ccl_get_feature_sse(pass) \
        _mm_loadu_ps(buffer + (pass) * pass_stride)
20
/* Iterate over the pixel window [low.x, high.x) x [low.y, high.y), handling
 * four horizontally adjacent pixels per iteration.
 *
 * The macro expects `pixel`, `pixel_buffer`, `buffer`, `buffer_w`, `rect`,
 * `low` and `high` to be declared by the surrounding code. Inside the loop:
 *  - pixel_buffer points to the first of the four current pixels in the
 *    first pass,
 *  - x4 and y4 contain the coordinates of the four pixels,
 *  - active_pixels is a mask that is set for every lane with x4 < high.x,
 *    i.e. for the pixels that are still inside the window.
 *
 * The loop must be closed with END_FOR_PIXEL_WINDOW_SSE. */
#define FOR_PIXEL_WINDOW_SSE \
        pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
        for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
                __m128 y4 = _mm_set1_ps(pixel.y); \
                for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \
                        __m128 x4 = _mm_add_ps(_mm_set1_ps(pixel.x), _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); \
                        __m128 active_pixels = _mm_cmplt_ps(x4, _mm_set1_ps(high.x));
31
/* Closes a loop opened with FOR_PIXEL_WINDOW_SSE.
 * When a row is finished, the inner loop has advanced pixel_buffer by
 * (pixel.x - low.x) floats; adding buffer_w minus that amount leaves
 * pixel_buffer exactly buffer_w floats past the start of the finished row. */
#define END_FOR_PIXEL_WINDOW_SSE \
                } \
                pixel_buffer += buffer_w - (pixel.x - low.x); \
        }
35
36 ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y,
37                                                __m128 active_pixels,
38                                                const float *ccl_restrict buffer,
39                                                __m128 *features,
40                                                const __m128 ccl_restrict *mean,
41                                                int pass_stride)
42 {
43         features[0] = x;
44         features[1] = y;
45         features[2] = _mm_fabs_ps(ccl_get_feature_sse(0));
46         features[3] = ccl_get_feature_sse(1);
47         features[4] = ccl_get_feature_sse(2);
48         features[5] = ccl_get_feature_sse(3);
49         features[6] = ccl_get_feature_sse(4);
50         features[7] = ccl_get_feature_sse(5);
51         features[8] = ccl_get_feature_sse(6);
52         features[9] = ccl_get_feature_sse(7);
53         if(mean) {
54                 for(int i = 0; i < DENOISE_FEATURES; i++)
55                         features[i] = _mm_sub_ps(features[i], mean[i]);
56         }
57         for(int i = 0; i < DENOISE_FEATURES; i++)
58                 features[i] = _mm_mask_ps(features[i], active_pixels);
59 }
60
61 ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y,
62                                                      __m128 active_pixels,
63                                                      const float *ccl_restrict buffer,
64                                                      __m128 *scales,
65                                                      const __m128 *ccl_restrict mean,
66                                                      int pass_stride)
67 {
68         scales[0] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(x, mean[0])), active_pixels);
69         scales[1] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(y, mean[1])), active_pixels);
70
71         scales[2] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(_mm_fabs_ps(ccl_get_feature_sse(0)), mean[2])), active_pixels);
72
73         __m128 diff, scale;
74         diff = _mm_sub_ps(ccl_get_feature_sse(1), mean[3]);
75         scale = _mm_mul_ps(diff, diff);
76         diff = _mm_sub_ps(ccl_get_feature_sse(2), mean[4]);
77         scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
78         diff = _mm_sub_ps(ccl_get_feature_sse(3), mean[5]);
79         scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
80         scales[3] = _mm_mask_ps(scale, active_pixels);
81
82         scales[4] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(4), mean[6])), active_pixels);
83
84         diff = _mm_sub_ps(ccl_get_feature_sse(5), mean[7]);
85         scale = _mm_mul_ps(diff, diff);
86         diff = _mm_sub_ps(ccl_get_feature_sse(6), mean[8]);
87         scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
88         diff = _mm_sub_ps(ccl_get_feature_sse(7), mean[9]);
89         scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
90         scales[5] = _mm_mask_ps(scale, active_pixels);
91 }
92
93 ccl_device_inline void filter_calculate_scale_sse(__m128 *scale)
94 {
95         scale[0] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[0]), _mm_set1_ps(0.01f)));
96         scale[1] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[1]), _mm_set1_ps(0.01f)));
97         scale[2] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[2]), _mm_set1_ps(0.01f)));
98         scale[6] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[4]), _mm_set1_ps(0.01f)));
99
100         scale[7] = scale[8] = scale[9] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[5])), _mm_set1_ps(0.01f)));
101         scale[3] = scale[4] = scale[5] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[3])), _mm_set1_ps(0.01f)));
102 }
103
104
105 CCL_NAMESPACE_END