SSE vector class for better performance in my CPU-based ray tracer [on hold]











up vote
-3
down vote

favorite












I was creating a vector class using SSE intrinsics for my CPU-based ray tracer. I thought that simply replacing the regular vector operations would give me a good performance gain. I compiled it with the Visual Studio 2017 compiler in x86 and x64 release mode. In x86 it crashed; in x64 it worked fine, but there was no performance gain.



#ifndef INC_VEC4_H
#define INC_VEC4_H

#include<nmmintrin.h>

/// Four-lane single-precision vector stored in one SSE register.
///
/// The anonymous union exposes the same 16 bytes as the raw `__m128` (`vec`),
/// as x/y/z/w, or as r/g/b/a. `alignas(16)` is the standard spelling of the
/// 16-byte alignment request (previously the MSVC-only `_declspec(align(16))`).
///
/// Every `_mm_dp_ps(..., 0x7F)` below multiplies only the x, y and z lanes and
/// broadcasts the sum to all four lanes, so w never contributes to dot
/// products or lengths.
///
/// NOTE(review): the constructors, copy assignment and destructor are only
/// declared here, so each `vec4 result;` below is presumably an out-of-line
/// call; that may be why the SSE version shows no speed-up -- consider
/// defaulting or inlining them where they are defined.
/// NOTE(review): on 32-bit targets plain `operator new` is not guaranteed to
/// return 16-byte aligned storage unless aligned new is enabled; heap-allocated
/// vec4 objects may therefore fault -- verify against the reported x86 crash.
struct alignas(16) vec4
{
    union {
        __m128 vec;
        struct { float x, y, z, w; };
        struct { float r, g, b, a; };
    };

    // Normalised directions (declared here, defined elsewhere).
    static vec4 UP;
    static vec4 DOWN;
    static vec4 ZERO;
    static vec4 LEFT;
    static vec4 RIGHT;
    static vec4 FORWARD;
    static vec4 BACKWARD;

    // Construction, assignment and destruction (declared here, defined elsewhere).
    vec4();
    vec4(float x, float y, float z, float w = 1.0f);
    explicit vec4(float n);
    vec4(const vec4& other);
    vec4& operator = (const vec4& other);
    ~vec4();

    /// Dot product of the x, y, z lanes of *this and other.
    float dot(const vec4& other) const
    {
        return _mm_cvtss_f32(_mm_dp_ps(vec, other.vec, 0x7F));
    }

    /// Scales *this in place by the approximate reciprocal of its xyz length.
    /// _mm_rsqrt_ps is a low-precision estimate, and the factor is applied to
    /// all four lanes, w included. A zero-length vector is not special-cased.
    void make_it_unit()
    {
        const __m128 squaredLen = _mm_dp_ps(vec, vec, 0x7F);
        vec = _mm_mul_ps(vec, _mm_rsqrt_ps(squaredLen));
    }

    /// Returns a copy scaled exactly as make_it_unit() would scale *this.
    vec4 normalize() const
    {
        vec4 result;
        const __m128 squaredLen = _mm_dp_ps(vec, vec, 0x7F);
        result.vec = _mm_mul_ps(vec, _mm_rsqrt_ps(squaredLen));
        return result;
    }

    /// Euclidean length of the x, y, z lanes.
    float length() const
    {
        // Only lane 0 is read back, so a scalar square root is sufficient.
        return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(vec, vec, 0x7F)));
    }

    // vec4-vec4 arithmetic: every operator works lane-wise on all four lanes.
    vec4 operator + (const vec4& other) const {
        vec4 result;
        result.vec = _mm_add_ps(vec, other.vec);
        return result;
    }
    vec4 operator - (const vec4& other) const {
        vec4 result;
        result.vec = _mm_sub_ps(vec, other.vec);
        return result;
    }

    void operator += (const vec4& other) {
        vec = _mm_add_ps(vec, other.vec);
    }

    vec4 operator * (const vec4& other) const {
        vec4 result;
        result.vec = _mm_mul_ps(vec, other.vec);
        return result;
    }
    vec4 operator / (const vec4& other) const {
        vec4 result;
        result.vec = _mm_div_ps(vec, other.vec);
        return result;
    }
    void operator *= (const vec4& other) {
        vec = _mm_mul_ps(vec, other.vec);
    }
    void operator /= (const vec4& other) {
        vec = _mm_div_ps(vec, other.vec);
    }

    // vec4-scalar arithmetic: the scalar is broadcast to all four lanes.
    vec4 operator / (float scalar) const {
        vec4 result;
        result.vec = _mm_div_ps(vec, _mm_set1_ps(scalar));
        return result;
    }
    void operator *= (float scalar) {
        vec = _mm_mul_ps(vec, _mm_set1_ps(scalar));
    }
    void operator /= (float scalar) {
        vec = _mm_div_ps(vec, _mm_set1_ps(scalar));
    }

    /// Squared Euclidean length of the x, y, z lanes.
    float squared_length() const {
        return _mm_cvtss_f32(_mm_dp_ps(vec, vec, 0x7F));
    }

    /// Sets all four lanes to 0.0f and returns *this so calls can be chained.
    vec4& make_itzero() {
        vec = _mm_setzero_ps();
        return *this;  // previously missing: falling off the end was undefined behaviour
    }

    /// Cross product of the x, y, z lanes; the w lane of the result is
    /// w*other.w - w*other.w.
    vec4 cross(const vec4& other) const {
        vec4 result;
        // (y,z,x,w) * (z,x,y,w) - (z,x,y,w) * (y,z,x,w), lane-wise.
        result.vec = _mm_sub_ps(
            _mm_mul_ps(_mm_shuffle_ps(vec, vec, _MM_SHUFFLE(3, 0, 2, 1)), _mm_shuffle_ps(other.vec, other.vec, _MM_SHUFFLE(3, 1, 0, 2))),
            _mm_mul_ps(_mm_shuffle_ps(vec, vec, _MM_SHUFFLE(3, 1, 0, 2)), _mm_shuffle_ps(other.vec, other.vec, _MM_SHUFFLE(3, 0, 2, 1)))
        );
        return result;
    }

    /// True when all four lanes compare equal to zero, i.e. the state that
    /// make_itzero() produces. -0.0f counts as zero; NaN does not.
    /// NOTE(review): this function previously had no body -- confirm that w
    /// should take part in the test.
    bool check_ifzero() const {
        const __m128 isZero = _mm_cmpeq_ps(vec, _mm_setzero_ps());
        return _mm_movemask_ps(isZero) == 0xF;
    }
};

//non-member inline operators
/// Multiplies every lane of v (w included) by scalar.
inline vec4 operator * (const vec4& v, float scalar) {
    vec4 scaled;
    const __m128 factor = _mm_set1_ps(scalar);  // same value in all four lanes
    scaled.vec = _mm_mul_ps(v.vec, factor);
    return scaled;
}

/// Scalar-on-the-left form: multiplies every lane of v (w included) by scalar.
inline vec4 operator * (float scalar, const vec4& v) {
    vec4 scaled;
    const __m128 factor = _mm_set1_ps(scalar);  // same value in all four lanes
    scaled.vec = _mm_mul_ps(v.vec, factor);
    return scaled;
}

/// Dot product of the x, y, z lanes of v1 and v2. Mask 0x7F makes _mm_dp_ps
/// multiply only those three lanes; the sum is read back from lane 0.
inline float dot(const vec4& v1, const vec4& v2) {
    return _mm_cvtss_f32(_mm_dp_ps(v1.vec, v2.vec, 0x7F));
}

#endif


Can someone explain what's wrong here?










share|improve this question









New contributor




Ankit Singh is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.











put on hold as unclear what you're asking by Toby Speight, πάντα ῥεῖ, Graipher, Edward, Mast Nov 14 at 13:29


Please clarify your specific problem or add additional details to highlight exactly what you need. As it's currently written, it’s hard to tell exactly what you're asking. See the How to Ask page for help clarifying this question. If this question can be reworded to fit the rules in the help center, please edit the question.















  • What happened to check_ifzero()?
    – 200_success
    Nov 14 at 13:28















up vote
-3
down vote

favorite












I was creating a vector class using SSE intrinsics for my CPU-based ray tracer. I thought that simply replacing the regular vector operations would give me a good performance gain. I compiled it with the Visual Studio 2017 compiler in x86 and x64 release mode. In x86 it crashed; in x64 it worked fine, but there was no performance gain.



#ifndef INC_VEC4_H
#define INC_VEC4_H

#include<nmmintrin.h>

/// Four-lane single-precision vector stored in one SSE register.
///
/// The anonymous union exposes the same 16 bytes as the raw `__m128` (`vec`),
/// as x/y/z/w, or as r/g/b/a. `alignas(16)` is the standard spelling of the
/// 16-byte alignment request (previously the MSVC-only `_declspec(align(16))`).
///
/// Every `_mm_dp_ps(..., 0x7F)` below multiplies only the x, y and z lanes and
/// broadcasts the sum to all four lanes, so w never contributes to dot
/// products or lengths.
///
/// NOTE(review): the constructors, copy assignment and destructor are only
/// declared here, so each `vec4 result;` below is presumably an out-of-line
/// call; that may be why the SSE version shows no speed-up -- consider
/// defaulting or inlining them where they are defined.
/// NOTE(review): on 32-bit targets plain `operator new` is not guaranteed to
/// return 16-byte aligned storage unless aligned new is enabled; heap-allocated
/// vec4 objects may therefore fault -- verify against the reported x86 crash.
struct alignas(16) vec4
{
    union {
        __m128 vec;
        struct { float x, y, z, w; };
        struct { float r, g, b, a; };
    };

    // Normalised directions (declared here, defined elsewhere).
    static vec4 UP;
    static vec4 DOWN;
    static vec4 ZERO;
    static vec4 LEFT;
    static vec4 RIGHT;
    static vec4 FORWARD;
    static vec4 BACKWARD;

    // Construction, assignment and destruction (declared here, defined elsewhere).
    vec4();
    vec4(float x, float y, float z, float w = 1.0f);
    explicit vec4(float n);
    vec4(const vec4& other);
    vec4& operator = (const vec4& other);
    ~vec4();

    /// Dot product of the x, y, z lanes of *this and other.
    float dot(const vec4& other) const
    {
        return _mm_cvtss_f32(_mm_dp_ps(vec, other.vec, 0x7F));
    }

    /// Scales *this in place by the approximate reciprocal of its xyz length.
    /// _mm_rsqrt_ps is a low-precision estimate, and the factor is applied to
    /// all four lanes, w included. A zero-length vector is not special-cased.
    void make_it_unit()
    {
        const __m128 squaredLen = _mm_dp_ps(vec, vec, 0x7F);
        vec = _mm_mul_ps(vec, _mm_rsqrt_ps(squaredLen));
    }

    /// Returns a copy scaled exactly as make_it_unit() would scale *this.
    vec4 normalize() const
    {
        vec4 result;
        const __m128 squaredLen = _mm_dp_ps(vec, vec, 0x7F);
        result.vec = _mm_mul_ps(vec, _mm_rsqrt_ps(squaredLen));
        return result;
    }

    /// Euclidean length of the x, y, z lanes.
    float length() const
    {
        // Only lane 0 is read back, so a scalar square root is sufficient.
        return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(vec, vec, 0x7F)));
    }

    // vec4-vec4 arithmetic: every operator works lane-wise on all four lanes.
    vec4 operator + (const vec4& other) const {
        vec4 result;
        result.vec = _mm_add_ps(vec, other.vec);
        return result;
    }
    vec4 operator - (const vec4& other) const {
        vec4 result;
        result.vec = _mm_sub_ps(vec, other.vec);
        return result;
    }

    void operator += (const vec4& other) {
        vec = _mm_add_ps(vec, other.vec);
    }

    vec4 operator * (const vec4& other) const {
        vec4 result;
        result.vec = _mm_mul_ps(vec, other.vec);
        return result;
    }
    vec4 operator / (const vec4& other) const {
        vec4 result;
        result.vec = _mm_div_ps(vec, other.vec);
        return result;
    }
    void operator *= (const vec4& other) {
        vec = _mm_mul_ps(vec, other.vec);
    }
    void operator /= (const vec4& other) {
        vec = _mm_div_ps(vec, other.vec);
    }

    // vec4-scalar arithmetic: the scalar is broadcast to all four lanes.
    vec4 operator / (float scalar) const {
        vec4 result;
        result.vec = _mm_div_ps(vec, _mm_set1_ps(scalar));
        return result;
    }
    void operator *= (float scalar) {
        vec = _mm_mul_ps(vec, _mm_set1_ps(scalar));
    }
    void operator /= (float scalar) {
        vec = _mm_div_ps(vec, _mm_set1_ps(scalar));
    }

    /// Squared Euclidean length of the x, y, z lanes.
    float squared_length() const {
        return _mm_cvtss_f32(_mm_dp_ps(vec, vec, 0x7F));
    }

    /// Sets all four lanes to 0.0f and returns *this so calls can be chained.
    vec4& make_itzero() {
        vec = _mm_setzero_ps();
        return *this;  // previously missing: falling off the end was undefined behaviour
    }

    /// Cross product of the x, y, z lanes; the w lane of the result is
    /// w*other.w - w*other.w.
    vec4 cross(const vec4& other) const {
        vec4 result;
        // (y,z,x,w) * (z,x,y,w) - (z,x,y,w) * (y,z,x,w), lane-wise.
        result.vec = _mm_sub_ps(
            _mm_mul_ps(_mm_shuffle_ps(vec, vec, _MM_SHUFFLE(3, 0, 2, 1)), _mm_shuffle_ps(other.vec, other.vec, _MM_SHUFFLE(3, 1, 0, 2))),
            _mm_mul_ps(_mm_shuffle_ps(vec, vec, _MM_SHUFFLE(3, 1, 0, 2)), _mm_shuffle_ps(other.vec, other.vec, _MM_SHUFFLE(3, 0, 2, 1)))
        );
        return result;
    }

    /// True when all four lanes compare equal to zero, i.e. the state that
    /// make_itzero() produces. -0.0f counts as zero; NaN does not.
    /// NOTE(review): this function previously had no body -- confirm that w
    /// should take part in the test.
    bool check_ifzero() const {
        const __m128 isZero = _mm_cmpeq_ps(vec, _mm_setzero_ps());
        return _mm_movemask_ps(isZero) == 0xF;
    }
};

//non-member inline operators
/// Multiplies every lane of v (w included) by scalar.
inline vec4 operator * (const vec4& v, float scalar) {
    vec4 scaled;
    const __m128 factor = _mm_set1_ps(scalar);  // same value in all four lanes
    scaled.vec = _mm_mul_ps(v.vec, factor);
    return scaled;
}

/// Scalar-on-the-left form: multiplies every lane of v (w included) by scalar.
inline vec4 operator * (float scalar, const vec4& v) {
    vec4 scaled;
    const __m128 factor = _mm_set1_ps(scalar);  // same value in all four lanes
    scaled.vec = _mm_mul_ps(v.vec, factor);
    return scaled;
}

/// Dot product of the x, y, z lanes of v1 and v2. Mask 0x7F makes _mm_dp_ps
/// multiply only those three lanes; the sum is read back from lane 0.
inline float dot(const vec4& v1, const vec4& v2) {
    return _mm_cvtss_f32(_mm_dp_ps(v1.vec, v2.vec, 0x7F));
}

#endif


Can someone explain what's wrong here?










share|improve this question









New contributor




Ankit Singh is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.











put on hold as unclear what you're asking by Toby Speight, πάντα ῥεῖ, Graipher, Edward, Mast Nov 14 at 13:29


Please clarify your specific problem or add additional details to highlight exactly what you need. As it's currently written, it’s hard to tell exactly what you're asking. See the How to Ask page for help clarifying this question. If this question can be reworded to fit the rules in the help center, please edit the question.















  • What happened to check_ifzero()?
    – 200_success
    Nov 14 at 13:28













up vote
-3
down vote

favorite









up vote
-3
down vote

favorite











I was creating a vector class using SSE intrinsics for my CPU-based ray tracer. I thought that simply replacing the regular vector operations would give me a good performance gain. I compiled it with the Visual Studio 2017 compiler in x86 and x64 release mode. In x86 it crashed; in x64 it worked fine, but there was no performance gain.



#ifndef INC_VEC4_H
#define INC_VEC4_H

#include<nmmintrin.h>

/// Four-lane single-precision vector stored in one SSE register.
///
/// The anonymous union exposes the same 16 bytes as the raw `__m128` (`vec`),
/// as x/y/z/w, or as r/g/b/a. `alignas(16)` is the standard spelling of the
/// 16-byte alignment request (previously the MSVC-only `_declspec(align(16))`).
///
/// Every `_mm_dp_ps(..., 0x7F)` below multiplies only the x, y and z lanes and
/// broadcasts the sum to all four lanes, so w never contributes to dot
/// products or lengths.
///
/// NOTE(review): the constructors, copy assignment and destructor are only
/// declared here, so each `vec4 result;` below is presumably an out-of-line
/// call; that may be why the SSE version shows no speed-up -- consider
/// defaulting or inlining them where they are defined.
/// NOTE(review): on 32-bit targets plain `operator new` is not guaranteed to
/// return 16-byte aligned storage unless aligned new is enabled; heap-allocated
/// vec4 objects may therefore fault -- verify against the reported x86 crash.
struct alignas(16) vec4
{
    union {
        __m128 vec;
        struct { float x, y, z, w; };
        struct { float r, g, b, a; };
    };

    // Normalised directions (declared here, defined elsewhere).
    static vec4 UP;
    static vec4 DOWN;
    static vec4 ZERO;
    static vec4 LEFT;
    static vec4 RIGHT;
    static vec4 FORWARD;
    static vec4 BACKWARD;

    // Construction, assignment and destruction (declared here, defined elsewhere).
    vec4();
    vec4(float x, float y, float z, float w = 1.0f);
    explicit vec4(float n);
    vec4(const vec4& other);
    vec4& operator = (const vec4& other);
    ~vec4();

    /// Dot product of the x, y, z lanes of *this and other.
    float dot(const vec4& other) const
    {
        return _mm_cvtss_f32(_mm_dp_ps(vec, other.vec, 0x7F));
    }

    /// Scales *this in place by the approximate reciprocal of its xyz length.
    /// _mm_rsqrt_ps is a low-precision estimate, and the factor is applied to
    /// all four lanes, w included. A zero-length vector is not special-cased.
    void make_it_unit()
    {
        const __m128 squaredLen = _mm_dp_ps(vec, vec, 0x7F);
        vec = _mm_mul_ps(vec, _mm_rsqrt_ps(squaredLen));
    }

    /// Returns a copy scaled exactly as make_it_unit() would scale *this.
    vec4 normalize() const
    {
        vec4 result;
        const __m128 squaredLen = _mm_dp_ps(vec, vec, 0x7F);
        result.vec = _mm_mul_ps(vec, _mm_rsqrt_ps(squaredLen));
        return result;
    }

    /// Euclidean length of the x, y, z lanes.
    float length() const
    {
        // Only lane 0 is read back, so a scalar square root is sufficient.
        return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(vec, vec, 0x7F)));
    }

    // vec4-vec4 arithmetic: every operator works lane-wise on all four lanes.
    vec4 operator + (const vec4& other) const {
        vec4 result;
        result.vec = _mm_add_ps(vec, other.vec);
        return result;
    }
    vec4 operator - (const vec4& other) const {
        vec4 result;
        result.vec = _mm_sub_ps(vec, other.vec);
        return result;
    }

    void operator += (const vec4& other) {
        vec = _mm_add_ps(vec, other.vec);
    }

    vec4 operator * (const vec4& other) const {
        vec4 result;
        result.vec = _mm_mul_ps(vec, other.vec);
        return result;
    }
    vec4 operator / (const vec4& other) const {
        vec4 result;
        result.vec = _mm_div_ps(vec, other.vec);
        return result;
    }
    void operator *= (const vec4& other) {
        vec = _mm_mul_ps(vec, other.vec);
    }
    void operator /= (const vec4& other) {
        vec = _mm_div_ps(vec, other.vec);
    }

    // vec4-scalar arithmetic: the scalar is broadcast to all four lanes.
    vec4 operator / (float scalar) const {
        vec4 result;
        result.vec = _mm_div_ps(vec, _mm_set1_ps(scalar));
        return result;
    }
    void operator *= (float scalar) {
        vec = _mm_mul_ps(vec, _mm_set1_ps(scalar));
    }
    void operator /= (float scalar) {
        vec = _mm_div_ps(vec, _mm_set1_ps(scalar));
    }

    /// Squared Euclidean length of the x, y, z lanes.
    float squared_length() const {
        return _mm_cvtss_f32(_mm_dp_ps(vec, vec, 0x7F));
    }

    /// Sets all four lanes to 0.0f and returns *this so calls can be chained.
    vec4& make_itzero() {
        vec = _mm_setzero_ps();
        return *this;  // previously missing: falling off the end was undefined behaviour
    }

    /// Cross product of the x, y, z lanes; the w lane of the result is
    /// w*other.w - w*other.w.
    vec4 cross(const vec4& other) const {
        vec4 result;
        // (y,z,x,w) * (z,x,y,w) - (z,x,y,w) * (y,z,x,w), lane-wise.
        result.vec = _mm_sub_ps(
            _mm_mul_ps(_mm_shuffle_ps(vec, vec, _MM_SHUFFLE(3, 0, 2, 1)), _mm_shuffle_ps(other.vec, other.vec, _MM_SHUFFLE(3, 1, 0, 2))),
            _mm_mul_ps(_mm_shuffle_ps(vec, vec, _MM_SHUFFLE(3, 1, 0, 2)), _mm_shuffle_ps(other.vec, other.vec, _MM_SHUFFLE(3, 0, 2, 1)))
        );
        return result;
    }

    /// True when all four lanes compare equal to zero, i.e. the state that
    /// make_itzero() produces. -0.0f counts as zero; NaN does not.
    /// NOTE(review): this function previously had no body -- confirm that w
    /// should take part in the test.
    bool check_ifzero() const {
        const __m128 isZero = _mm_cmpeq_ps(vec, _mm_setzero_ps());
        return _mm_movemask_ps(isZero) == 0xF;
    }
};

//non-member inline operators
/// Multiplies every lane of v (w included) by scalar.
inline vec4 operator * (const vec4& v, float scalar) {
    vec4 scaled;
    const __m128 factor = _mm_set1_ps(scalar);  // same value in all four lanes
    scaled.vec = _mm_mul_ps(v.vec, factor);
    return scaled;
}

/// Scalar-on-the-left form: multiplies every lane of v (w included) by scalar.
inline vec4 operator * (float scalar, const vec4& v) {
    vec4 scaled;
    const __m128 factor = _mm_set1_ps(scalar);  // same value in all four lanes
    scaled.vec = _mm_mul_ps(v.vec, factor);
    return scaled;
}

/// Dot product of the x, y, z lanes of v1 and v2. Mask 0x7F makes _mm_dp_ps
/// multiply only those three lanes; the sum is read back from lane 0.
inline float dot(const vec4& v1, const vec4& v2) {
    return _mm_cvtss_f32(_mm_dp_ps(v1.vec, v2.vec, 0x7F));
}

#endif


Can someone explain what's wrong here?










share|improve this question









New contributor




Ankit Singh is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.











I was creating a vector class using SSE intrinsics for my CPU-based ray tracer. I thought that simply replacing the regular vector operations would give me a good performance gain. I compiled it with the Visual Studio 2017 compiler in x86 and x64 release mode. In x86 it crashed; in x64 it worked fine, but there was no performance gain.



#ifndef INC_VEC4_H
#define INC_VEC4_H

#include<nmmintrin.h>

/// Four-lane single-precision vector stored in one SSE register.
///
/// The anonymous union exposes the same 16 bytes as the raw `__m128` (`vec`),
/// as x/y/z/w, or as r/g/b/a. `alignas(16)` is the standard spelling of the
/// 16-byte alignment request (previously the MSVC-only `_declspec(align(16))`).
///
/// Every `_mm_dp_ps(..., 0x7F)` below multiplies only the x, y and z lanes and
/// broadcasts the sum to all four lanes, so w never contributes to dot
/// products or lengths.
///
/// NOTE(review): the constructors, copy assignment and destructor are only
/// declared here, so each `vec4 result;` below is presumably an out-of-line
/// call; that may be why the SSE version shows no speed-up -- consider
/// defaulting or inlining them where they are defined.
/// NOTE(review): on 32-bit targets plain `operator new` is not guaranteed to
/// return 16-byte aligned storage unless aligned new is enabled; heap-allocated
/// vec4 objects may therefore fault -- verify against the reported x86 crash.
struct alignas(16) vec4
{
    union {
        __m128 vec;
        struct { float x, y, z, w; };
        struct { float r, g, b, a; };
    };

    // Normalised directions (declared here, defined elsewhere).
    static vec4 UP;
    static vec4 DOWN;
    static vec4 ZERO;
    static vec4 LEFT;
    static vec4 RIGHT;
    static vec4 FORWARD;
    static vec4 BACKWARD;

    // Construction, assignment and destruction (declared here, defined elsewhere).
    vec4();
    vec4(float x, float y, float z, float w = 1.0f);
    explicit vec4(float n);
    vec4(const vec4& other);
    vec4& operator = (const vec4& other);
    ~vec4();

    /// Dot product of the x, y, z lanes of *this and other.
    float dot(const vec4& other) const
    {
        return _mm_cvtss_f32(_mm_dp_ps(vec, other.vec, 0x7F));
    }

    /// Scales *this in place by the approximate reciprocal of its xyz length.
    /// _mm_rsqrt_ps is a low-precision estimate, and the factor is applied to
    /// all four lanes, w included. A zero-length vector is not special-cased.
    void make_it_unit()
    {
        const __m128 squaredLen = _mm_dp_ps(vec, vec, 0x7F);
        vec = _mm_mul_ps(vec, _mm_rsqrt_ps(squaredLen));
    }

    /// Returns a copy scaled exactly as make_it_unit() would scale *this.
    vec4 normalize() const
    {
        vec4 result;
        const __m128 squaredLen = _mm_dp_ps(vec, vec, 0x7F);
        result.vec = _mm_mul_ps(vec, _mm_rsqrt_ps(squaredLen));
        return result;
    }

    /// Euclidean length of the x, y, z lanes.
    float length() const
    {
        // Only lane 0 is read back, so a scalar square root is sufficient.
        return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(vec, vec, 0x7F)));
    }

    // vec4-vec4 arithmetic: every operator works lane-wise on all four lanes.
    vec4 operator + (const vec4& other) const {
        vec4 result;
        result.vec = _mm_add_ps(vec, other.vec);
        return result;
    }
    vec4 operator - (const vec4& other) const {
        vec4 result;
        result.vec = _mm_sub_ps(vec, other.vec);
        return result;
    }

    void operator += (const vec4& other) {
        vec = _mm_add_ps(vec, other.vec);
    }

    vec4 operator * (const vec4& other) const {
        vec4 result;
        result.vec = _mm_mul_ps(vec, other.vec);
        return result;
    }
    vec4 operator / (const vec4& other) const {
        vec4 result;
        result.vec = _mm_div_ps(vec, other.vec);
        return result;
    }
    void operator *= (const vec4& other) {
        vec = _mm_mul_ps(vec, other.vec);
    }
    void operator /= (const vec4& other) {
        vec = _mm_div_ps(vec, other.vec);
    }

    // vec4-scalar arithmetic: the scalar is broadcast to all four lanes.
    vec4 operator / (float scalar) const {
        vec4 result;
        result.vec = _mm_div_ps(vec, _mm_set1_ps(scalar));
        return result;
    }
    void operator *= (float scalar) {
        vec = _mm_mul_ps(vec, _mm_set1_ps(scalar));
    }
    void operator /= (float scalar) {
        vec = _mm_div_ps(vec, _mm_set1_ps(scalar));
    }

    /// Squared Euclidean length of the x, y, z lanes.
    float squared_length() const {
        return _mm_cvtss_f32(_mm_dp_ps(vec, vec, 0x7F));
    }

    /// Sets all four lanes to 0.0f and returns *this so calls can be chained.
    vec4& make_itzero() {
        vec = _mm_setzero_ps();
        return *this;  // previously missing: falling off the end was undefined behaviour
    }

    /// Cross product of the x, y, z lanes; the w lane of the result is
    /// w*other.w - w*other.w.
    vec4 cross(const vec4& other) const {
        vec4 result;
        // (y,z,x,w) * (z,x,y,w) - (z,x,y,w) * (y,z,x,w), lane-wise.
        result.vec = _mm_sub_ps(
            _mm_mul_ps(_mm_shuffle_ps(vec, vec, _MM_SHUFFLE(3, 0, 2, 1)), _mm_shuffle_ps(other.vec, other.vec, _MM_SHUFFLE(3, 1, 0, 2))),
            _mm_mul_ps(_mm_shuffle_ps(vec, vec, _MM_SHUFFLE(3, 1, 0, 2)), _mm_shuffle_ps(other.vec, other.vec, _MM_SHUFFLE(3, 0, 2, 1)))
        );
        return result;
    }

    /// True when all four lanes compare equal to zero, i.e. the state that
    /// make_itzero() produces. -0.0f counts as zero; NaN does not.
    /// NOTE(review): this function previously had no body -- confirm that w
    /// should take part in the test.
    bool check_ifzero() const {
        const __m128 isZero = _mm_cmpeq_ps(vec, _mm_setzero_ps());
        return _mm_movemask_ps(isZero) == 0xF;
    }
};

//non-member inline operators
/// Multiplies every lane of v (w included) by scalar.
inline vec4 operator * (const vec4& v, float scalar) {
    vec4 scaled;
    const __m128 factor = _mm_set1_ps(scalar);  // same value in all four lanes
    scaled.vec = _mm_mul_ps(v.vec, factor);
    return scaled;
}

/// Scalar-on-the-left form: multiplies every lane of v (w included) by scalar.
inline vec4 operator * (float scalar, const vec4& v) {
    vec4 scaled;
    const __m128 factor = _mm_set1_ps(scalar);  // same value in all four lanes
    scaled.vec = _mm_mul_ps(v.vec, factor);
    return scaled;
}

/// Dot product of the x, y, z lanes of v1 and v2. Mask 0x7F makes _mm_dp_ps
/// multiply only those three lanes; the sum is read back from lane 0.
inline float dot(const vec4& v1, const vec4& v2) {
    return _mm_cvtss_f32(_mm_dp_ps(v1.vec, v2.vec, 0x7F));
}

#endif


Can someone explain what's wrong here?







c++ performance coordinate-system sse raytracing






share|improve this question









New contributor




Ankit Singh is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.











share|improve this question









New contributor




Ankit Singh is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.









share|improve this question




share|improve this question








edited Nov 14 at 13:27









200_success

127k15148410




127k15148410






New contributor




Ankit Singh is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.









asked Nov 14 at 10:07









Ankit Singh

11




11




New contributor




Ankit Singh is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.





New contributor





Ankit Singh is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.






Ankit Singh is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.




put on hold as unclear what you're asking by Toby Speight, πάντα ῥεῖ, Graipher, Edward, Mast Nov 14 at 13:29


Please clarify your specific problem or add additional details to highlight exactly what you need. As it's currently written, it’s hard to tell exactly what you're asking. See the How to Ask page for help clarifying this question. If this question can be reworded to fit the rules in the help center, please edit the question.






put on hold as unclear what you're asking by Toby Speight, πάντα ῥεῖ, Graipher, Edward, Mast Nov 14 at 13:29


Please clarify your specific problem or add additional details to highlight exactly what you need. As it's currently written, it’s hard to tell exactly what you're asking. See the How to Ask page for help clarifying this question. If this question can be reworded to fit the rules in the help center, please edit the question.














  • What happened to check_ifzero()?
    – 200_success
    Nov 14 at 13:28


















  • What happened to check_ifzero()?
    – 200_success
    Nov 14 at 13:28
















What happened to check_ifzero()?
– 200_success
Nov 14 at 13:28




What happened to check_ifzero()?
– 200_success
Nov 14 at 13:28















active

oldest

votes






















active

oldest

votes













active

oldest

votes









active

oldest

votes






active

oldest

votes

Popular posts from this blog

Morgemoulin

Scott Moir

Souastre