SSE vector class for better performance in my CPU-based ray tracer [on hold]
up vote
-3
down vote
favorite
I was creating a vector class using SSE intrinsics for my CPU-based ray tracer. I thought that simply replacing the regular vector operations would give me a good performance gain. I compiled it with the Visual Studio 2017 compiler in x86 and x64 release mode. In x86 it crashed; in x64 it worked fine, but there was no performance gain.
#ifndef INC_VEC4_H
#define INC_VEC4_H
#include<nmmintrin.h>
/// 4-component float vector stored in one 128-bit SSE register.
///
/// The same 16 bytes are reachable as the raw register (`vec`), as
/// coordinates (`x, y, z, w`) or as a colour (`r, g, b, a`).
///
/// Dot products, lengths and normalisation use `_mm_dp_ps` with mask 0x7F:
/// the high nibble (0x7) multiplies only the x, y, z lanes, the low nibble
/// (0xF) broadcasts the sum to all four lanes. `w` never contributes.
///
/// NOTE(review): on 32-bit MSVC builds, plain `new` may return storage that
/// is only 8-byte aligned unless aligned new is enabled; heap-allocating a
/// vec4 (or an object containing one) is a plausible cause of a crash on
/// x86 -- verify against the call sites.
struct alignas(16) vec4
{
    union {
        __m128 vec;
        struct { float x, y, z, w; };
        struct { float r, g, b, a; };
    };

    // Normalised directions; only declared here, the definitions are not in this header.
    static vec4 UP;
    static vec4 DOWN;
    static vec4 ZERO;
    static vec4 LEFT;
    static vec4 RIGHT;
    static vec4 FORWARD;
    static vec4 BACKWARD;

    // Construction, assignment and destruction; only declared here.
    // NOTE(review): because the copy operations and the destructor are
    // user-provided, vec4 is not trivially copyable. If the out-of-line
    // definitions merely copy `vec`, defaulting them may let the compiler
    // keep values in registers -- confirm against the definitions.
    vec4();
    vec4(float x, float y, float z, float w = 1.0f);
    explicit vec4(float n);
    vec4(const vec4& other);
    vec4& operator = (const vec4& other);
    ~vec4();

    /// 3-component dot product (x, y, z) with `other`.
    float dot(const vec4& other) const
    {
        return _mm_cvtss_f32(_mm_dp_ps(vec, other.vec, 0x7F));
    }

    /// Scales this vector in place by the approximate reciprocal of its
    /// xyz length. All four lanes, including w, are scaled.
    /// `_mm_rsqrt_ps` is a low-precision approximation, so the result is
    /// only approximately unit length. There is no guard for zero length.
    void make_it_unit()
    {
        const __m128 lenSq = _mm_dp_ps(vec, vec, 0x7F);
        vec = _mm_mul_ps(vec, _mm_rsqrt_ps(lenSq));
    }

    /// Returns a copy scaled like make_it_unit(); this vector is unchanged.
    vec4 normalize() const
    {
        vec4 result;
        const __m128 lenSq = _mm_dp_ps(vec, vec, 0x7F);
        result.vec = _mm_mul_ps(vec, _mm_rsqrt_ps(lenSq));
        return result;
    }

    /// Euclidean length of the x, y, z components.
    float length() const
    {
        // Only lane 0 is read back, so a scalar square root is enough.
        return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(vec, vec, 0x7F)));
    }

    // vec4-vec4 arithmetic: every operator works on all four lanes.
    vec4 operator + (const vec4& other) const {
        vec4 result;
        result.vec = _mm_add_ps(vec, other.vec);
        return result;
    }
    vec4 operator - (const vec4& other) const {
        vec4 result;
        result.vec = _mm_sub_ps(vec, other.vec);
        return result;
    }
    void operator += (const vec4& other) {
        vec = _mm_add_ps(vec, other.vec);
    }
    vec4 operator * (const vec4& other) const {
        vec4 result;
        result.vec = _mm_mul_ps(vec, other.vec);
        return result;
    }
    vec4 operator / (const vec4& other) const {
        vec4 result;
        result.vec = _mm_div_ps(vec, other.vec);
        return result;
    }
    void operator *= (const vec4& other) {
        vec = _mm_mul_ps(vec, other.vec);
    }
    void operator /= (const vec4& other) {
        vec = _mm_div_ps(vec, other.vec);
    }

    // vec4-scalar arithmetic: the scalar is broadcast to all four lanes.
    vec4 operator / (float scalar) const {
        vec4 result;
        result.vec = _mm_div_ps(vec, _mm_set1_ps(scalar));
        return result;
    }
    void operator *= (float scalar) {
        vec = _mm_mul_ps(vec, _mm_set1_ps(scalar));
    }
    void operator /= (float scalar) {
        vec = _mm_div_ps(vec, _mm_set1_ps(scalar));
    }

    /// Squared Euclidean length of the x, y, z components.
    float squared_length() const {
        return _mm_cvtss_f32(_mm_dp_ps(vec, vec, 0x7F));
    }

    /// Sets all four components to 0 and returns *this.
    /// (The original fell off the end of a function returning vec4&.)
    vec4& make_itzero() {
        vec = _mm_setzero_ps();
        return *this;
    }

    /// Cross product of the x, y, z components.
    /// The w lane of the result is w*other.w - w*other.w.
    vec4 cross(const vec4& other) const {
        // _MM_SHUFFLE(3, 0, 2, 1) -> (y, z, x, w); _MM_SHUFFLE(3, 1, 0, 2) -> (z, x, y, w)
        const __m128 a_yzx = _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(3, 0, 2, 1));
        const __m128 b_zxy = _mm_shuffle_ps(other.vec, other.vec, _MM_SHUFFLE(3, 1, 0, 2));
        const __m128 a_zxy = _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(3, 1, 0, 2));
        const __m128 b_yzx = _mm_shuffle_ps(other.vec, other.vec, _MM_SHUFFLE(3, 0, 2, 1));
        vec4 result;
        result.vec = _mm_sub_ps(_mm_mul_ps(a_yzx, b_zxy), _mm_mul_ps(a_zxy, b_yzx));
        return result;
    }

    /// Returns true when all four components compare equal to 0.0f
    /// (-0.0f counts as zero; a NaN lane makes the result false).
    /// NOTE(review): the original body was empty, so the intended semantics
    /// are assumed; all four lanes are tested to mirror make_itzero() --
    /// confirm whether w should be ignored.
    bool check_ifzero() const {
        return _mm_movemask_ps(_mm_cmpeq_ps(vec, _mm_setzero_ps())) == 0xF;
    }
};
//non-member inline operators
/// Returns `v` with every component (including w) multiplied by `scalar`.
inline vec4 operator * (const vec4& v, float scalar) {
    const __m128 factor = _mm_set1_ps(scalar);  // same value in all four lanes
    vec4 scaled;
    scaled.vec = _mm_mul_ps(v.vec, factor);
    return scaled;
}
/// Scalar-on-the-left form: returns `v` with every component (including w)
/// multiplied by `scalar`.
inline vec4 operator * (float scalar, const vec4& v) {
    const __m128 factor = _mm_set1_ps(scalar);  // same value in all four lanes
    vec4 scaled;
    scaled.vec = _mm_mul_ps(v.vec, factor);
    return scaled;
}
/// 3-component dot product of `v1` and `v2`.
/// Mask 0x7F multiplies only the x, y, z lanes, so w is ignored.
inline float dot(const vec4& v1, const vec4& v2) {
    const __m128 sum = _mm_dp_ps(v1.vec, v2.vec, 0x7F);
    return _mm_cvtss_f32(sum);  // read lane 0
}
#endif
Can someone explain what's wrong here?
c++ performance coordinate-system sse raytracing
New contributor
put on hold as unclear what you're asking by Toby Speight, πάντα ῥεῖ, Graipher, Edward, Mast Nov 14 at 13:29
Please clarify your specific problem or add additional details to highlight exactly what you need. As it's currently written, it’s hard to tell exactly what you're asking. See the How to Ask page for help clarifying this question. If this question can be reworded to fit the rules in the help center, please edit the question.
add a comment |
up vote
-3
down vote
favorite
I was creating a vector class using SSE intrinsics for my CPU-based ray tracer. I thought that simply replacing the regular vector operations would give me a good performance gain. I compiled it with the Visual Studio 2017 compiler in x86 and x64 release mode. In x86 it crashed; in x64 it worked fine, but there was no performance gain.
#ifndef INC_VEC4_H
#define INC_VEC4_H
#include<nmmintrin.h>
/// 4-component float vector stored in one 128-bit SSE register.
///
/// The same 16 bytes are reachable as the raw register (`vec`), as
/// coordinates (`x, y, z, w`) or as a colour (`r, g, b, a`).
///
/// Dot products, lengths and normalisation use `_mm_dp_ps` with mask 0x7F:
/// the high nibble (0x7) multiplies only the x, y, z lanes, the low nibble
/// (0xF) broadcasts the sum to all four lanes. `w` never contributes.
///
/// NOTE(review): on 32-bit MSVC builds, plain `new` may return storage that
/// is only 8-byte aligned unless aligned new is enabled; heap-allocating a
/// vec4 (or an object containing one) is a plausible cause of a crash on
/// x86 -- verify against the call sites.
struct alignas(16) vec4
{
    union {
        __m128 vec;
        struct { float x, y, z, w; };
        struct { float r, g, b, a; };
    };

    // Normalised directions; only declared here, the definitions are not in this header.
    static vec4 UP;
    static vec4 DOWN;
    static vec4 ZERO;
    static vec4 LEFT;
    static vec4 RIGHT;
    static vec4 FORWARD;
    static vec4 BACKWARD;

    // Construction, assignment and destruction; only declared here.
    // NOTE(review): because the copy operations and the destructor are
    // user-provided, vec4 is not trivially copyable. If the out-of-line
    // definitions merely copy `vec`, defaulting them may let the compiler
    // keep values in registers -- confirm against the definitions.
    vec4();
    vec4(float x, float y, float z, float w = 1.0f);
    explicit vec4(float n);
    vec4(const vec4& other);
    vec4& operator = (const vec4& other);
    ~vec4();

    /// 3-component dot product (x, y, z) with `other`.
    float dot(const vec4& other) const
    {
        return _mm_cvtss_f32(_mm_dp_ps(vec, other.vec, 0x7F));
    }

    /// Scales this vector in place by the approximate reciprocal of its
    /// xyz length. All four lanes, including w, are scaled.
    /// `_mm_rsqrt_ps` is a low-precision approximation, so the result is
    /// only approximately unit length. There is no guard for zero length.
    void make_it_unit()
    {
        const __m128 lenSq = _mm_dp_ps(vec, vec, 0x7F);
        vec = _mm_mul_ps(vec, _mm_rsqrt_ps(lenSq));
    }

    /// Returns a copy scaled like make_it_unit(); this vector is unchanged.
    vec4 normalize() const
    {
        vec4 result;
        const __m128 lenSq = _mm_dp_ps(vec, vec, 0x7F);
        result.vec = _mm_mul_ps(vec, _mm_rsqrt_ps(lenSq));
        return result;
    }

    /// Euclidean length of the x, y, z components.
    float length() const
    {
        // Only lane 0 is read back, so a scalar square root is enough.
        return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(vec, vec, 0x7F)));
    }

    // vec4-vec4 arithmetic: every operator works on all four lanes.
    vec4 operator + (const vec4& other) const {
        vec4 result;
        result.vec = _mm_add_ps(vec, other.vec);
        return result;
    }
    vec4 operator - (const vec4& other) const {
        vec4 result;
        result.vec = _mm_sub_ps(vec, other.vec);
        return result;
    }
    void operator += (const vec4& other) {
        vec = _mm_add_ps(vec, other.vec);
    }
    vec4 operator * (const vec4& other) const {
        vec4 result;
        result.vec = _mm_mul_ps(vec, other.vec);
        return result;
    }
    vec4 operator / (const vec4& other) const {
        vec4 result;
        result.vec = _mm_div_ps(vec, other.vec);
        return result;
    }
    void operator *= (const vec4& other) {
        vec = _mm_mul_ps(vec, other.vec);
    }
    void operator /= (const vec4& other) {
        vec = _mm_div_ps(vec, other.vec);
    }

    // vec4-scalar arithmetic: the scalar is broadcast to all four lanes.
    vec4 operator / (float scalar) const {
        vec4 result;
        result.vec = _mm_div_ps(vec, _mm_set1_ps(scalar));
        return result;
    }
    void operator *= (float scalar) {
        vec = _mm_mul_ps(vec, _mm_set1_ps(scalar));
    }
    void operator /= (float scalar) {
        vec = _mm_div_ps(vec, _mm_set1_ps(scalar));
    }

    /// Squared Euclidean length of the x, y, z components.
    float squared_length() const {
        return _mm_cvtss_f32(_mm_dp_ps(vec, vec, 0x7F));
    }

    /// Sets all four components to 0 and returns *this.
    /// (The original fell off the end of a function returning vec4&.)
    vec4& make_itzero() {
        vec = _mm_setzero_ps();
        return *this;
    }

    /// Cross product of the x, y, z components.
    /// The w lane of the result is w*other.w - w*other.w.
    vec4 cross(const vec4& other) const {
        // _MM_SHUFFLE(3, 0, 2, 1) -> (y, z, x, w); _MM_SHUFFLE(3, 1, 0, 2) -> (z, x, y, w)
        const __m128 a_yzx = _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(3, 0, 2, 1));
        const __m128 b_zxy = _mm_shuffle_ps(other.vec, other.vec, _MM_SHUFFLE(3, 1, 0, 2));
        const __m128 a_zxy = _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(3, 1, 0, 2));
        const __m128 b_yzx = _mm_shuffle_ps(other.vec, other.vec, _MM_SHUFFLE(3, 0, 2, 1));
        vec4 result;
        result.vec = _mm_sub_ps(_mm_mul_ps(a_yzx, b_zxy), _mm_mul_ps(a_zxy, b_yzx));
        return result;
    }

    /// Returns true when all four components compare equal to 0.0f
    /// (-0.0f counts as zero; a NaN lane makes the result false).
    /// NOTE(review): the original body was empty, so the intended semantics
    /// are assumed; all four lanes are tested to mirror make_itzero() --
    /// confirm whether w should be ignored.
    bool check_ifzero() const {
        return _mm_movemask_ps(_mm_cmpeq_ps(vec, _mm_setzero_ps())) == 0xF;
    }
};
//non-member inline operators
/// Returns `v` with every component (including w) multiplied by `scalar`.
inline vec4 operator * (const vec4& v, float scalar) {
    const __m128 factor = _mm_set1_ps(scalar);  // same value in all four lanes
    vec4 scaled;
    scaled.vec = _mm_mul_ps(v.vec, factor);
    return scaled;
}
/// Scalar-on-the-left form: returns `v` with every component (including w)
/// multiplied by `scalar`.
inline vec4 operator * (float scalar, const vec4& v) {
    const __m128 factor = _mm_set1_ps(scalar);  // same value in all four lanes
    vec4 scaled;
    scaled.vec = _mm_mul_ps(v.vec, factor);
    return scaled;
}
/// 3-component dot product of `v1` and `v2`.
/// Mask 0x7F multiplies only the x, y, z lanes, so w is ignored.
inline float dot(const vec4& v1, const vec4& v2) {
    const __m128 sum = _mm_dp_ps(v1.vec, v2.vec, 0x7F);
    return _mm_cvtss_f32(sum);  // read lane 0
}
#endif
Can someone explain what's wrong here?
c++ performance coordinate-system sse raytracing
New contributor
put on hold as unclear what you're asking by Toby Speight, πάντα ῥεῖ, Graipher, Edward, Mast Nov 14 at 13:29
Please clarify your specific problem or add additional details to highlight exactly what you need. As it's currently written, it’s hard to tell exactly what you're asking. See the How to Ask page for help clarifying this question. If this question can be reworded to fit the rules in the help center, please edit the question.
What happened to check_ifzero()
?
– 200_success
Nov 14 at 13:28
add a comment |
up vote
-3
down vote
favorite
up vote
-3
down vote
favorite
I was creating a vector class using SSE intrinsics for my CPU-based ray tracer. I thought that simply replacing the regular vector operations would give me a good performance gain. I compiled it with the Visual Studio 2017 compiler in x86 and x64 release mode. In x86 it crashed; in x64 it worked fine, but there was no performance gain.
#ifndef INC_VEC4_H
#define INC_VEC4_H
#include<nmmintrin.h>
/// 4-component float vector stored in one 128-bit SSE register.
///
/// The same 16 bytes are reachable as the raw register (`vec`), as
/// coordinates (`x, y, z, w`) or as a colour (`r, g, b, a`).
///
/// Dot products, lengths and normalisation use `_mm_dp_ps` with mask 0x7F:
/// the high nibble (0x7) multiplies only the x, y, z lanes, the low nibble
/// (0xF) broadcasts the sum to all four lanes. `w` never contributes.
///
/// NOTE(review): on 32-bit MSVC builds, plain `new` may return storage that
/// is only 8-byte aligned unless aligned new is enabled; heap-allocating a
/// vec4 (or an object containing one) is a plausible cause of a crash on
/// x86 -- verify against the call sites.
struct alignas(16) vec4
{
    union {
        __m128 vec;
        struct { float x, y, z, w; };
        struct { float r, g, b, a; };
    };

    // Normalised directions; only declared here, the definitions are not in this header.
    static vec4 UP;
    static vec4 DOWN;
    static vec4 ZERO;
    static vec4 LEFT;
    static vec4 RIGHT;
    static vec4 FORWARD;
    static vec4 BACKWARD;

    // Construction, assignment and destruction; only declared here.
    // NOTE(review): because the copy operations and the destructor are
    // user-provided, vec4 is not trivially copyable. If the out-of-line
    // definitions merely copy `vec`, defaulting them may let the compiler
    // keep values in registers -- confirm against the definitions.
    vec4();
    vec4(float x, float y, float z, float w = 1.0f);
    explicit vec4(float n);
    vec4(const vec4& other);
    vec4& operator = (const vec4& other);
    ~vec4();

    /// 3-component dot product (x, y, z) with `other`.
    float dot(const vec4& other) const
    {
        return _mm_cvtss_f32(_mm_dp_ps(vec, other.vec, 0x7F));
    }

    /// Scales this vector in place by the approximate reciprocal of its
    /// xyz length. All four lanes, including w, are scaled.
    /// `_mm_rsqrt_ps` is a low-precision approximation, so the result is
    /// only approximately unit length. There is no guard for zero length.
    void make_it_unit()
    {
        const __m128 lenSq = _mm_dp_ps(vec, vec, 0x7F);
        vec = _mm_mul_ps(vec, _mm_rsqrt_ps(lenSq));
    }

    /// Returns a copy scaled like make_it_unit(); this vector is unchanged.
    vec4 normalize() const
    {
        vec4 result;
        const __m128 lenSq = _mm_dp_ps(vec, vec, 0x7F);
        result.vec = _mm_mul_ps(vec, _mm_rsqrt_ps(lenSq));
        return result;
    }

    /// Euclidean length of the x, y, z components.
    float length() const
    {
        // Only lane 0 is read back, so a scalar square root is enough.
        return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(vec, vec, 0x7F)));
    }

    // vec4-vec4 arithmetic: every operator works on all four lanes.
    vec4 operator + (const vec4& other) const {
        vec4 result;
        result.vec = _mm_add_ps(vec, other.vec);
        return result;
    }
    vec4 operator - (const vec4& other) const {
        vec4 result;
        result.vec = _mm_sub_ps(vec, other.vec);
        return result;
    }
    void operator += (const vec4& other) {
        vec = _mm_add_ps(vec, other.vec);
    }
    vec4 operator * (const vec4& other) const {
        vec4 result;
        result.vec = _mm_mul_ps(vec, other.vec);
        return result;
    }
    vec4 operator / (const vec4& other) const {
        vec4 result;
        result.vec = _mm_div_ps(vec, other.vec);
        return result;
    }
    void operator *= (const vec4& other) {
        vec = _mm_mul_ps(vec, other.vec);
    }
    void operator /= (const vec4& other) {
        vec = _mm_div_ps(vec, other.vec);
    }

    // vec4-scalar arithmetic: the scalar is broadcast to all four lanes.
    vec4 operator / (float scalar) const {
        vec4 result;
        result.vec = _mm_div_ps(vec, _mm_set1_ps(scalar));
        return result;
    }
    void operator *= (float scalar) {
        vec = _mm_mul_ps(vec, _mm_set1_ps(scalar));
    }
    void operator /= (float scalar) {
        vec = _mm_div_ps(vec, _mm_set1_ps(scalar));
    }

    /// Squared Euclidean length of the x, y, z components.
    float squared_length() const {
        return _mm_cvtss_f32(_mm_dp_ps(vec, vec, 0x7F));
    }

    /// Sets all four components to 0 and returns *this.
    /// (The original fell off the end of a function returning vec4&.)
    vec4& make_itzero() {
        vec = _mm_setzero_ps();
        return *this;
    }

    /// Cross product of the x, y, z components.
    /// The w lane of the result is w*other.w - w*other.w.
    vec4 cross(const vec4& other) const {
        // _MM_SHUFFLE(3, 0, 2, 1) -> (y, z, x, w); _MM_SHUFFLE(3, 1, 0, 2) -> (z, x, y, w)
        const __m128 a_yzx = _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(3, 0, 2, 1));
        const __m128 b_zxy = _mm_shuffle_ps(other.vec, other.vec, _MM_SHUFFLE(3, 1, 0, 2));
        const __m128 a_zxy = _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(3, 1, 0, 2));
        const __m128 b_yzx = _mm_shuffle_ps(other.vec, other.vec, _MM_SHUFFLE(3, 0, 2, 1));
        vec4 result;
        result.vec = _mm_sub_ps(_mm_mul_ps(a_yzx, b_zxy), _mm_mul_ps(a_zxy, b_yzx));
        return result;
    }

    /// Returns true when all four components compare equal to 0.0f
    /// (-0.0f counts as zero; a NaN lane makes the result false).
    /// NOTE(review): the original body was empty, so the intended semantics
    /// are assumed; all four lanes are tested to mirror make_itzero() --
    /// confirm whether w should be ignored.
    bool check_ifzero() const {
        return _mm_movemask_ps(_mm_cmpeq_ps(vec, _mm_setzero_ps())) == 0xF;
    }
};
//non-member inline operators
/// Returns `v` with every component (including w) multiplied by `scalar`.
inline vec4 operator * (const vec4& v, float scalar) {
    const __m128 factor = _mm_set1_ps(scalar);  // same value in all four lanes
    vec4 scaled;
    scaled.vec = _mm_mul_ps(v.vec, factor);
    return scaled;
}
/// Scalar-on-the-left form: returns `v` with every component (including w)
/// multiplied by `scalar`.
inline vec4 operator * (float scalar, const vec4& v) {
    const __m128 factor = _mm_set1_ps(scalar);  // same value in all four lanes
    vec4 scaled;
    scaled.vec = _mm_mul_ps(v.vec, factor);
    return scaled;
}
/// 3-component dot product of `v1` and `v2`.
/// Mask 0x7F multiplies only the x, y, z lanes, so w is ignored.
inline float dot(const vec4& v1, const vec4& v2) {
    const __m128 sum = _mm_dp_ps(v1.vec, v2.vec, 0x7F);
    return _mm_cvtss_f32(sum);  // read lane 0
}
#endif
Can someone explain what's wrong here?
c++ performance coordinate-system sse raytracing
New contributor
I was creating a vector class using SSE intrinsics for my CPU-based ray tracer. I thought that simply replacing the regular vector operations would give me a good performance gain. I compiled it with the Visual Studio 2017 compiler in x86 and x64 release mode. In x86 it crashed; in x64 it worked fine, but there was no performance gain.
#ifndef INC_VEC4_H
#define INC_VEC4_H
#include<nmmintrin.h>
/// 4-component float vector stored in one 128-bit SSE register.
///
/// The same 16 bytes are reachable as the raw register (`vec`), as
/// coordinates (`x, y, z, w`) or as a colour (`r, g, b, a`).
///
/// Dot products, lengths and normalisation use `_mm_dp_ps` with mask 0x7F:
/// the high nibble (0x7) multiplies only the x, y, z lanes, the low nibble
/// (0xF) broadcasts the sum to all four lanes. `w` never contributes.
///
/// NOTE(review): on 32-bit MSVC builds, plain `new` may return storage that
/// is only 8-byte aligned unless aligned new is enabled; heap-allocating a
/// vec4 (or an object containing one) is a plausible cause of a crash on
/// x86 -- verify against the call sites.
struct alignas(16) vec4
{
    union {
        __m128 vec;
        struct { float x, y, z, w; };
        struct { float r, g, b, a; };
    };

    // Normalised directions; only declared here, the definitions are not in this header.
    static vec4 UP;
    static vec4 DOWN;
    static vec4 ZERO;
    static vec4 LEFT;
    static vec4 RIGHT;
    static vec4 FORWARD;
    static vec4 BACKWARD;

    // Construction, assignment and destruction; only declared here.
    // NOTE(review): because the copy operations and the destructor are
    // user-provided, vec4 is not trivially copyable. If the out-of-line
    // definitions merely copy `vec`, defaulting them may let the compiler
    // keep values in registers -- confirm against the definitions.
    vec4();
    vec4(float x, float y, float z, float w = 1.0f);
    explicit vec4(float n);
    vec4(const vec4& other);
    vec4& operator = (const vec4& other);
    ~vec4();

    /// 3-component dot product (x, y, z) with `other`.
    float dot(const vec4& other) const
    {
        return _mm_cvtss_f32(_mm_dp_ps(vec, other.vec, 0x7F));
    }

    /// Scales this vector in place by the approximate reciprocal of its
    /// xyz length. All four lanes, including w, are scaled.
    /// `_mm_rsqrt_ps` is a low-precision approximation, so the result is
    /// only approximately unit length. There is no guard for zero length.
    void make_it_unit()
    {
        const __m128 lenSq = _mm_dp_ps(vec, vec, 0x7F);
        vec = _mm_mul_ps(vec, _mm_rsqrt_ps(lenSq));
    }

    /// Returns a copy scaled like make_it_unit(); this vector is unchanged.
    vec4 normalize() const
    {
        vec4 result;
        const __m128 lenSq = _mm_dp_ps(vec, vec, 0x7F);
        result.vec = _mm_mul_ps(vec, _mm_rsqrt_ps(lenSq));
        return result;
    }

    /// Euclidean length of the x, y, z components.
    float length() const
    {
        // Only lane 0 is read back, so a scalar square root is enough.
        return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(vec, vec, 0x7F)));
    }

    // vec4-vec4 arithmetic: every operator works on all four lanes.
    vec4 operator + (const vec4& other) const {
        vec4 result;
        result.vec = _mm_add_ps(vec, other.vec);
        return result;
    }
    vec4 operator - (const vec4& other) const {
        vec4 result;
        result.vec = _mm_sub_ps(vec, other.vec);
        return result;
    }
    void operator += (const vec4& other) {
        vec = _mm_add_ps(vec, other.vec);
    }
    vec4 operator * (const vec4& other) const {
        vec4 result;
        result.vec = _mm_mul_ps(vec, other.vec);
        return result;
    }
    vec4 operator / (const vec4& other) const {
        vec4 result;
        result.vec = _mm_div_ps(vec, other.vec);
        return result;
    }
    void operator *= (const vec4& other) {
        vec = _mm_mul_ps(vec, other.vec);
    }
    void operator /= (const vec4& other) {
        vec = _mm_div_ps(vec, other.vec);
    }

    // vec4-scalar arithmetic: the scalar is broadcast to all four lanes.
    vec4 operator / (float scalar) const {
        vec4 result;
        result.vec = _mm_div_ps(vec, _mm_set1_ps(scalar));
        return result;
    }
    void operator *= (float scalar) {
        vec = _mm_mul_ps(vec, _mm_set1_ps(scalar));
    }
    void operator /= (float scalar) {
        vec = _mm_div_ps(vec, _mm_set1_ps(scalar));
    }

    /// Squared Euclidean length of the x, y, z components.
    float squared_length() const {
        return _mm_cvtss_f32(_mm_dp_ps(vec, vec, 0x7F));
    }

    /// Sets all four components to 0 and returns *this.
    /// (The original fell off the end of a function returning vec4&.)
    vec4& make_itzero() {
        vec = _mm_setzero_ps();
        return *this;
    }

    /// Cross product of the x, y, z components.
    /// The w lane of the result is w*other.w - w*other.w.
    vec4 cross(const vec4& other) const {
        // _MM_SHUFFLE(3, 0, 2, 1) -> (y, z, x, w); _MM_SHUFFLE(3, 1, 0, 2) -> (z, x, y, w)
        const __m128 a_yzx = _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(3, 0, 2, 1));
        const __m128 b_zxy = _mm_shuffle_ps(other.vec, other.vec, _MM_SHUFFLE(3, 1, 0, 2));
        const __m128 a_zxy = _mm_shuffle_ps(vec, vec, _MM_SHUFFLE(3, 1, 0, 2));
        const __m128 b_yzx = _mm_shuffle_ps(other.vec, other.vec, _MM_SHUFFLE(3, 0, 2, 1));
        vec4 result;
        result.vec = _mm_sub_ps(_mm_mul_ps(a_yzx, b_zxy), _mm_mul_ps(a_zxy, b_yzx));
        return result;
    }

    /// Returns true when all four components compare equal to 0.0f
    /// (-0.0f counts as zero; a NaN lane makes the result false).
    /// NOTE(review): the original body was empty, so the intended semantics
    /// are assumed; all four lanes are tested to mirror make_itzero() --
    /// confirm whether w should be ignored.
    bool check_ifzero() const {
        return _mm_movemask_ps(_mm_cmpeq_ps(vec, _mm_setzero_ps())) == 0xF;
    }
};
//non-member inline operators
/// Returns `v` with every component (including w) multiplied by `scalar`.
inline vec4 operator * (const vec4& v, float scalar) {
    const __m128 factor = _mm_set1_ps(scalar);  // same value in all four lanes
    vec4 scaled;
    scaled.vec = _mm_mul_ps(v.vec, factor);
    return scaled;
}
/// Scalar-on-the-left form: returns `v` with every component (including w)
/// multiplied by `scalar`.
inline vec4 operator * (float scalar, const vec4& v) {
    const __m128 factor = _mm_set1_ps(scalar);  // same value in all four lanes
    vec4 scaled;
    scaled.vec = _mm_mul_ps(v.vec, factor);
    return scaled;
}
/// 3-component dot product of `v1` and `v2`.
/// Mask 0x7F multiplies only the x, y, z lanes, so w is ignored.
inline float dot(const vec4& v1, const vec4& v2) {
    const __m128 sum = _mm_dp_ps(v1.vec, v2.vec, 0x7F);
    return _mm_cvtss_f32(sum);  // read lane 0
}
#endif
Can someone explain what's wrong here?
c++ performance coordinate-system sse raytracing
c++ performance coordinate-system sse raytracing
New contributor
New contributor
edited Nov 14 at 13:27
200_success
127k15148410
127k15148410
New contributor
asked Nov 14 at 10:07
Ankit Singh
11
11
New contributor
New contributor
put on hold as unclear what you're asking by Toby Speight, πάντα ῥεῖ, Graipher, Edward, Mast Nov 14 at 13:29
Please clarify your specific problem or add additional details to highlight exactly what you need. As it's currently written, it’s hard to tell exactly what you're asking. See the How to Ask page for help clarifying this question. If this question can be reworded to fit the rules in the help center, please edit the question.
put on hold as unclear what you're asking by Toby Speight, πάντα ῥεῖ, Graipher, Edward, Mast Nov 14 at 13:29
Please clarify your specific problem or add additional details to highlight exactly what you need. As it's currently written, it’s hard to tell exactly what you're asking. See the How to Ask page for help clarifying this question. If this question can be reworded to fit the rules in the help center, please edit the question.
What happened to check_ifzero()
?
– 200_success
Nov 14 at 13:28
add a comment |
What happened to check_ifzero()
?
– 200_success
Nov 14 at 13:28
What happened to
check_ifzero()
?– 200_success
Nov 14 at 13:28
What happened to
check_ifzero()
?– 200_success
Nov 14 at 13:28
add a comment |
active
oldest
votes
active
oldest
votes
active
oldest
votes
active
oldest
votes
active
oldest
votes
What happened to
check_ifzero()
?– 200_success
Nov 14 at 13:28