#ifndef __NVEC_H
#define __NVEC_H

// n-dimensional vector class
// by Johannes Lampel / LampelSoft
// (c)  copyright Johannes Lampel
// written Sept 13th 2000

//#include "GlobalCqSOMDefs.h"
#include "nVecErr.h"
#include <stdio.h>
#include <limits.h>
#include <math.h>
#include <memory.h>
#include <iostream>
using namespace std;

// Scalar type of a single vector component.
typedef float nVec_var ;

// DllExport expands to __declspec( dllexport ) when __DLL is defined and to
// nothing otherwise, so the same header serves both kinds of build.
#ifdef __DLL
#define DllExport __declspec( dllexport )
#else
#define DllExport
#endif
// NOTE(review): InitType is not used anywhere in this header; its purpose is
// not visible from here.
typedef long InitType;
// Type used for dimension counts passed to the nVec constructors and to
// nVec::SetNumDim.
typedef long dim_c;

// __SSE selects the SSE code paths of nVec. It is defined unconditionally
// right here, so the intrinsic header below is always included; remove the
// define to fall back to the scalar (#ifndef __SSE) implementations.
#define __SSE
#ifdef __SSE
#include <xmmintrin.h>
#endif

// Number of 4-float SSE blocks needed to hold `a` components. The result is
// at least 1 for a >= 0, so even an empty vector gets one block. The argument
// is parenthesised so that arbitrary expressions (e.g. `1 << 4`) are cast and
// evaluated as a whole instead of being split by operator precedence.
#define __SSEOP(a) (1+((int)(a)-1)/4)

// Storage modes of an nVec (kept in nVec::iType).
enum nVecTypes{
	nVT_copy = 0,	// the vector holds its own copy of the data (set by the sizing/copying constructors)
	nVT_mem  = 1	// the vector points at caller-supplied memory (set by nVec::SetMemVec)
};

class nVec;

nVec operator *(nVec_var,nVec&);

class DllExport nVec{
	friend nVec operator *(nVec_var,nVec&);
	friend class SOMPattern;
public:
	nVec()
	{
		lCDim = 0;
		iType = -1;
		ppdDim = 0;
	}
	nVec(const dim_c dcDim)
	{
#ifndef __SSE
		ppdDim = new nVec_var[dcDim];
#else
		ppdDim = (nVec_var*) _aligned_malloc( __SSEOP(dcDim)*4 * sizeof(nVec_var), 16);
#endif
		lCDim = dcDim;
		iType = nVT_copy;							// copy data by default
	}
	nVec(const dim_c dcDim,const nVec_var * const dptr)
	{
#ifndef __SSE
		ppdDim = new nVec_var[dcDim];
#else
		ppdDim = (nVec_var*) _aligned_malloc( __SSEOP(dcDim)*4 * sizeof(nVec_var), 16);
#endif
		lCDim = dcDim;
		memcpy(ppdDim,dptr,sizeof(nVec_var) * lCDim);
		iType = nVT_copy;							// copy data by default
	}
	nVec(const nVec &npVec){
		ppdDim = 0;
		iType = nVT_copy;							// copy data by default
		this->operator = (npVec);
	}

	virtual ~nVec();

	nVec_var	Length(void)					// ...
	{
		return sqrt(SLength());
	}

	int		Normalize(void)						// ...
	{
		nVec_var dLength = Length();
		if(dLength==0.0){
			dLength=1.0;
			return 1;
		}
		dLength = 1.f / dLength;

		this->operator *=(dLength);
		return 1;
	}

	int		GetVec(nVec_var *dptr)						// copy data from field into this class
	{
		memcpy(ppdDim,dptr,sizeof(nVec_var) * lCDim);
		return 1;
	}

	int		SetMemVec(nVec_var *dpParam)				// set ddpDim to Param and iType to nVT_mem - be careful, you havn't a copy of this data, your changing the original data
	{
		// setting ppdDim to mempos and iType to nVT_mem (case del of old alloc mem)
		if(iType == nVT_copy){
#ifdef __SSE
			if(ppdDim) _aligned_free ( ppdDim );
#else
			if(ppdDim) delete [] ( ppdDim );
#endif
		}
		ppdDim = dpParam;
		iType = nVT_mem;
		return 1;
	}

	int		ConvMemtoCopy(void)							// convert a Mem - typed instance of this class to a instance, which contains a copy of this data items - alloc mem and copy
	{
		nVec_var *dpTmp;
#ifdef __SSE
		dpTmp = (nVec_var*) _aligned_malloc(__SSEOP(lCDim)*4 * sizeof(nVec_var), 16);
#else
		dpTmp = new nVec_var[lCDim];
#endif
		memcpy(dpTmp,ppdDim,sizeof(nVec_var) * lCDim);
		ppdDim = dpTmp;
		iType = nVT_copy;
		return 1;
	}

	int		SetVec(nVec_var *dpDest)						// copy data from class to field
	{
		memcpy(dpDest,ppdDim,sizeof(nVec_var) * lCDim);
		return 1;
	}
	nVec_var	SLength(void)								// return sum of all quadrs of comps
	{
#ifndef SHIT			// normal is faster
		nVec_var dQSum = 0,dTemp;							// sum of quadr
		long lschl;
		for(lschl = 0;lschl < lCDim;lschl ++){
			dTemp = ppdDim[lschl];
			dQSum += dTemp * dTemp;
		}
		return dQSum;
#else
		long lSize = __SSEOP(lCDim);
		for(lschl = lCDim + 1; lschl < lSize*4; lschl ++){
			ppdDim[lschl] = 0;
		}
		/*long lSize = lCDim / 4,lschl;

		for(lschl = lSize*16/sizeof(nVec_var);lschl < lCDim;lschl ++){		// zero extra aligned data ( cause that's not officially belonging to this vector )
			ppdDim[lschl] = 0.f;
		}
*/
		__m128 *add1 = (__m128*)ppdDim;
		__m128 sum;

		sum = _mm_setzero_ps();

		for(; lSize; lSize--){
			sum = _mm_add_ps(sum,_mm_mul_ps(*add1,*add1));

			add1++;
		}

		return  sum.m128_f32[0] + sum.m128_f32[1] + sum.m128_f32[2] + sum.m128_f32[3];
#endif
	}

	nVec_var getMax(void){
		nVec_var max = -1000000000;
		long lschl;
		for(lschl = 0;lschl < lCDim;lschl ++){
			if(ppdDim[lschl] > max){
				max = ppdDim[lschl];
			}
		}
		return max;
	}
	nVec_var getMin(void){
		nVec_var min = 1000000000;
		long lschl;
		for(lschl = 0;lschl < lCDim;lschl ++){
			if(ppdDim[lschl] < min){
				min = ppdDim[lschl];
			}
		}
		return min;
	}

	nVec_var	CBLength(void);			// return CBDist
	void	Zero(void)
	{
		this->operator =(0);
	}

	const nVec & operator = (const nVec_var fValue){
#ifndef __SSE
		long lschl;
		for(lschl=0;lschl < lCDim;lschl++){
			ppdDim[lschl] = fValue;
		}
#else
		__m128 m128_value;
		long lSize = __SSEOP(lCDim) ;
		__m128 *dest = (__m128*)ppdDim;

		m128_value = _mm_set_ps1(fValue);

		for(; lSize; lSize--){
			*dest = m128_value;
			dest ++;
		}
#endif
		return ((const nVec &)(*this));
	}

	void	ZeroAligned(void){
		// somewhat not working on SSE
		// this function should set all those bytes not needed for us to zero so we dont need to worry about NaN issues
#ifdef __SSE
		long lschl;
		long lSize = __SSEOP(lCDim);
		for(lschl = lCDim ; lschl < lSize*4; lschl ++){
			cout << lschl << endl;
			ppdDim[lschl] = 0;
		}
#else
#endif
	}

	int		SetNumDim(const dim_c dcDim){
#ifdef __SSE
		if(iType == nVT_copy && !ppdDim) _aligned_free ( ppdDim );
		lCDim = dcDim;
		ppdDim = ppdDim = (nVec_var*) _aligned_malloc(__SSEOP(dcDim)*4 * sizeof(nVec_var), 16);
#else
		if(iType == nVT_copy && !ppdDim) delete [] ( ppdDim );;
		lCDim = dcDim;
		ppdDim = new nVec_var[dcDim];
#endif
		return 1;
	}

	int		Load(FILE *);
	int		Save(FILE *);

	void testout(void);

	void _smult(const nVec &f1,const nVec &f2){
		// multiplying f1 and f2 componentwise and storing it in this instance
		if(f1.lCDim != f2.lCDim){
			throw IncompatibleVectors();
		}
		if(iType == nVT_copy){
			if(f1.lCDim != lCDim){		// if the dimensions are different, let's get the appropriate size
#ifdef __SSE
				if(ppdDim) _aligned_free(ppdDim);
				ppdDim = (nVec_var*) _aligned_malloc(__SSEOP(f1.lCDim)*4 * sizeof(nVec_var), 16);
#else
				if(ppdDim) delete [] ppdDim;
				ppdDim = new nVec_var[f1.lCDim];
#endif
			}
		}
		else{
			if(f1.lCDim != lCDim){
				throw IncompatibleVectors();
			}
		}
		// let's do the multiplication
#ifdef __SSE
		long lSize = __SSEOP(lCDim);

		__m128 *add1 = (__m128*)f1.ppdDim;			// having three different pointers and increasing them each time is faster than having just one index and recalculating the actual pointers ... at max that is up to 30% faster
		__m128 *add2 = (__m128*)f2.ppdDim;
		__m128 *add3 = (__m128*)ppdDim;

		for(; lSize; lSize--){
			*add3 = _mm_mul_ps(*add1,*add2);

			add1++;
			add2++;
			add3++;
		}
		/*long lSize = __SSEOP(lCDim);
		int index=0;

		for(; lSize; lSize--){
			*(__m128*)(ppdDim+index*4) = _mm_mul_ps(			// sizeof(__m128) / sizeif(float) = 4
				*(__m128*)(f1.ppdDim + index*4),
				*(__m128*)(f2.ppdDim + index*4) );

			index++;
		}*/
#else
		long lschl;
		for(lschl=0;lschl<lCDim;lschl++){
			ppdDim[lschl] = f1.ppdDim[lschl] * f2.ppdDim[lschl];
		}
#endif
	}

	void _sminusv(nVec_var fValue){
#ifdef __SSE
		__m128 m128_value;
		long lSize = __SSEOP(lCDim) ;
		__m128 *src = (__m128*)ppdDim;

		m128_value = _mm_set_ps1(fValue);

		for(; lSize; lSize--){
			*src = _mm_sub_ps(m128_value,*src);

			src ++;
		}
#else
		long lschl;
		for(lschl=0;lschl < lCDim;lschl++){
			ppdDim[lschl] = fValue - ppdDim[lschl];
		}
#endif
	}

	const nVec & operator = (const nVec &NParam)	// ...
	{
		try{
			if(iType == nVT_copy){
				if(NParam.lCDim != lCDim){		// if the dimensions are different, let's get the appriate size
#ifdef __SSE
					if(ppdDim) _aligned_free(ppdDim);
					ppdDim = (nVec_var*) _aligned_malloc(__SSEOP(NParam.lCDim)*4 * sizeof(nVec_var), 16);
#else
					if(ppdDim) delete [] ppdDim;
					ppdDim = new nVec_var[NParam.lCDim];
#endif
				}
				lCDim = NParam.lCDim;
#ifndef SHIT		// it's faster the nonSSE way
				memcpy(ppdDim,NParam.ppdDim,sizeof(nVec_var)*lCDim);
#else
				// this is slower using SSE
				long lSize = __SSEOP(lCDim);
				__m128 *dest = (__m128*)ppdDim;
				__m128 *src = (__m128*)NParam.ppdDim;

				for(; lSize; lSize--){
					_mm_store_ps ( (float *)dest,_mm_load_ps((float *)src));
					dest ++;
					src ++;
				}
#endif
			}
			else{
				if(lCDim != NParam.lCDim){
					throw IncompatibleVectors();
				}
				memcpy(ppdDim,NParam.ppdDim,sizeof(nVec_var)*lCDim);
			}
			return ((const nVec &)(*this));
		}
		catch(IncompatibleVectors IV){
			cerr << " 'IncompatibleVectors' thrown by = " << lCDim << " + " << NParam.lCDim <<" : "<< &NParam <<endl;
			throw (IV);
		}
	}
	const nVec & operator +=(const nVec &NParam)	// ...
	{
		try{
			if(lCDim != NParam.lCDim){
				throw IncompatibleVectors();
			}
#ifndef __SSE
			long lschl;
			for(lschl=0;lschl<lCDim;lschl++){
				ppdDim[lschl] += NParam.ppdDim[lschl];
			}
#else
			long lSize = __SSEOP(lCDim);

			__m128 *add1 = (__m128*)ppdDim;
			__m128 *add2 = (__m128*)NParam.ppdDim;

			for(; lSize; lSize--){
				*add1 = _mm_add_ps(*add1,*add2);

				add1++;
				add2++;
			}
#endif
		}
		catch(IncompatibleVectors IV){
			cerr << " 'IncompatibleVectors' thrown by += " << lCDim << " + " << NParam.lCDim <<" : "<< &NParam <<endl;
			throw(IV);
		}
		return ((const nVec &)(*this));
	}
	const nVec & operator -=(const nVec &NParam)	// ...
	{
		try{
			if(lCDim != NParam.lCDim){
				throw IncompatibleVectors();
			}
#ifndef __SSE
			long lschl;
			for(lschl=0;lschl < lCDim;lschl++){
				ppdDim[lschl] -= NParam.ppdDim[lschl];
			}
#else
			long lSize = __SSEOP(lCDim);

			__m128 *sub1 = (__m128*)ppdDim;
			__m128 *sub2 = (__m128*)NParam.ppdDim;
			__m128 dest;

			for(; lSize; lSize--){
				dest = _mm_sub_ps(*sub1,*sub2);
				*sub1 = dest;

				sub1++;
				sub2++;
			}
#endif
		}
		catch(IncompatibleVectors IV){
			cerr << " 'IncompatibleVectors' thrown by -= " << lCDim << " + " << NParam.lCDim <<" : "<< &NParam <<endl;
			throw(IV);
		}
		return ((const nVec &)(*this));
	}
	const nVec & operator *=(const nVec_var dParam)	// ...
	{
#ifndef __SSE
		long lschl;
		for(lschl=0;lschl<lCDim;lschl++){
			ppdDim[lschl] *= dParam;
		}
#else
		__m128 m128_fac;

		m128_fac = _mm_set_ps1(dParam);

		long lSize = __SSEOP(lCDim);

		__m128 *dest = (__m128*)ppdDim;

		for(; lSize; lSize--){
			*dest = _mm_mul_ps(*dest,m128_fac);
			dest ++;
		}
#endif
		return ((const nVec &)(*this));
	}
	const nVec & operator /=(const nVec_var dParam)	// ...
	{
		nVec_var fFac = 1.f / dParam;

		this->operator *= (fFac);

		return ((const nVec &)(*this));
	}

	nVec		operator +(const nVec &);	// ...
	nVec		operator -(const nVec &);	// ...
	nVec		operator *(const nVec_var);	// ...
	nVec		operator /(const nVec_var);	// ...

	nVec_var &	operator [](long lIndex)			// return value of [dimension] component
	{
		return ((nVec_var &)(ppdDim[lIndex]));
	}
	nVec_var *getData(void){return ppdDim;}
//protected:
	nVec_var *ppdDim;									// data items
	long lCDim;										// number of dimensions
	int iType;										// copy data or set pointer to data
private:
};

// Scalar-minus-vector operator; the definition is not part of this header.
// NOTE(review): presumably yields (f - x) for every component x of NParam,
// like nVec::_sminusv does in place - confirm against the implementation.
nVec operator -(nVec_var f, const nVec &NParam);

#endif // __NVEC_H
