Algorithm example 4 - Cone quad intersection
A cone is a geometrical shape similar to the waffle under ice cream and a quad is a surface polygon defined by four points in three dimensions. Parallelism inside an ordinary cone quad intersection will be utilized instead of intersecting cone against four quads or four cones against quad. This has the advantage, that nothing outside of the function has to be changed and the disadvantage, that not all parts of the algorithm can be vectorized.
To intersect a cone against a quad three things have to be checked: First whether any of the quad's vertices are inside the cone. Secondly whether any edge of the quad intersects the cone. Thirdly whether the axis of the cone intersects the quad. These can be done in arbitrary order and the order should be selected on the basis of probability for a test to succeed in an application.
Implementation here is based on "Intersection of a Triangle and a Cone by David Eberly", a copy of which is distributed with the source package. The SSE version was implemented before the scalar version:
//	D = cone direction
//	L = cone starting position
//	cphi2 = cosine power 2 of cone angle
//	P = Quad vertices
//
noinline bool ConeQuadIntersection(const vec3& D, const vec3& L, const float& cphi2, mat4x3& P)
{
vec4 LP0 = P[0] - L[0];
vec4 LP1 = P[1] - L[1];
vec4 LP2 = P[2] - L[2];
vec4 LPLP = LP0*LP0 + LP1*LP1 + LP2*LP2;
vec4 cLPLP = cphi2*LPLP;
vec4 LP0D = LP0*D[0] + LP1*D[1] + LP2*D[2];
vec4 LP0D2= LP0D*LP0D;
// check if any vertex is inside cone
vec4b conesidemask = LP0D >= 0.0f;
if (ForAny(conesidemask))
	{
	vec4b insidemask = LP0D2 >= cLPLP;
	if (ForAny(insidemask && conesidemask)) // any point inside?
		return true;
	}
else
	return false; // all on other side of the plane
// Check if any edge intersects cone
vec4 E0,E1,E2; // Edge vectors
E0[0] = P[0][1] - P[0][0];
E0[1] = P[0][2] - P[0][1];
E0[2] = P[0][3] - P[0][2];
E0[3] = P[0][0] - P[0][3];
E1[0] = P[1][1] - P[1][0];
E1[1] = P[1][2] - P[1][1];
E1[2] = P[1][3] - P[1][2];
E1[3] = P[1][0] - P[1][3];
E2[0] = P[2][1] - P[2][0];
E2[1] = P[2][2] - P[2][1];
E2[2] = P[2][3] - P[2][2];
E2[3] = P[2][0] - P[2][3];
vec4 DE = D[0]*E0 + D[1]*E1 + D[2]*E2;
vec4 c0 = LP0D2 - cLPLP;
vec4 c1 = DE * LP0D - cphi2 * (E0*LP0 + E1*LP1 + E2*LP2 );
vec4 Elen2 = E0*E0 + E1*E1 + E2*E2;
vec4 c2 = DE*DE - cphi2 * Elen2;
vec4b mask = c2 < 0.0f && c1*c1 >= c0*c2;
if (ForAny(mask)) // some edge can intersect
	{
	if (ForAll(conesidemask)) // all on the cone side
		{
		vec4b mask2 = mask && 0.0f <= c1 && c1 <= (-1.0f*c2);
		if (ForAny(mask2)) // edge intersects
			return true;
		}
	else
		{
		// unimplemented
		}
	}
// Check if cone axis intersects the quad
vec3 hitpoint;
vec3 planepoint;
planepoint[0]=P[0][0];
planepoint[1]=P[1][0];
planepoint[2]=P[2][0];
vec3 planenormal;
planenormal[0]=E1[0]*E2[1]-E1[1]*E2[0];
planenormal[1]=-E0[0]*E2[1]+E0[1]*E2[0];
planenormal[2]=E0[0]*E1[1]-E0[1]*E1[0];
float length = 1.0f/sqrtf(planenormal[0]*planenormal[0] + planenormal[1]*planenormal[1] + planenormal[2]*planenormal[2]);
planenormal *= length;
if (!RayPlaneIntersection(L, D, planenormal, planepoint, hitpoint))
	return false;
vec3 point = hitpoint - planepoint;
float p1 = ( point[0] * E0[0] + point[1] * E1[0] + point[2] * E2[0] ) / Elen2[0];
float p2 = ( point[0] * E0[1] + point[1] * E1[1] + point[2] * E2[1] ) / Elen2[1];
if (p1 >= 0.0f && p1 <= 1.0f && p2 >= 0.0f && p2 <= 1.0f) // cone inside quad?
	return true;
else
	return false;
}
Basically the first part (vertices inside cone) is vectorized, the second part (edges intersect cone) is vectorized and the last part (cone axis intersects quad) is not. The parallelism comes from there being four vertices and four edges to check in identical way. However an optimal implementation is little different between the scalar and vectorized version. The optimal vectorized version was perhaps even easier to code than the scalar one, because it is very straightforward. In the scalar version if the first vertex or edge test succeeds then the function can immediately return, which lowers the benefit from the vectorized version. The optimal edges to test are also different between the two.
// define scalar versions of vec4 and mat4x3
typedef n_std::cvalarray<n_std::cvalarray<float,4>,3> smat4x3;
typedef n_std::cvalarray<float,4> svec4;
typedef n_std::cvalarray<bool,4> svec4b;
bool ConeQuadIntersection(const vec3& dir, const vec3& pos, const float cosphi2, const smat4x3& P)
{
svec4 pospx,pospy,pospz;
svec4 pospdir;
svec4 posp2;
svec4 pospdir2;
bool anyin=false;
svec4b otherside;
// Check if any of the points in inside the cone
for (int i=0;i<4;i++)
	{
	pospx[i]=P[0][i] - pos[0];
	pospy[i]=P[1][i] - pos[1];
	pospz[i]=P[2][i] - pos[2];
	pospdir[i] = pospx[i] * dir[0] + pospy[i] * dir[1] + pospz[i] * dir[2];
	
	if (pospdir[i] > 0)
		{
		anyin=true;
		otherside[i]=false;
		posp2[i] = pospx[i]*pospx[i] + pospy[i]*pospy[i] + pospz[i]*pospz[i];
		pospdir2[i] = pospdir[i]*pospdir[i];
		if (pospdir2[i] >= cosphi2 * posp2[i])
			return true;
		}
	else
		otherside[i]=true;
	}
// all on other side of cone
if (!anyin)
	return false;
// Next test edges of the quad for intersection
vec3 edge1;
edge1[0] = P[0][1] - P[0][0];
edge1[1] = P[1][1] - P[1][0];
edge1[2] = P[2][1] - P[2][0];
float edge1l2 = edge1[0]*edge1[0] + edge1[1]*edge1[1] + edge1[2]*edge1[2];
if (!otherside[0] && !otherside[1])
	{
	float diredge = dir[0]*edge1[0] + dir[1]*edge1[1] + dir[2]*edge1[2];
	float c0 = pospdir2[0] - cosphi2*posp2[0];
	float c1 = diredge * pospdir[0] - cosphi2 * (edge1[0]*pospx[0] + edge1[1]*pospy[0] + edge1[2]*pospz[0]);
	float c2 = diredge*diredge - cosphi2 * edge1l2;
	if (c2 < 0.0f && 0.0f <= c1 && c1 <= (-1.0f)*c2 && c1*c1 >= c0*c2)
		return true;
	}
vec3 edge2;
edge2[0] = P[0][2] - P[0][0];
edge2[1] = P[1][2] - P[1][0];
edge2[2] = P[2][2] - P[2][0];
float edge2l2 = edge2[0]*edge2[0] + edge2[1]*edge2[1] + edge2[2]*edge2[2];
if (!otherside[0] && !otherside[2])
        {
        float diredge = dir[0]*edge2[0] + dir[1]*edge2[1] + dir[2]*edge2[2];
        float c0 = pospdir2[0] - cosphi2*posp2[0];
        float c1 = diredge * pospdir[0] - cosphi2 * (edge2[0]*pospx[0] + edge2[1]*pospy[0] + edge2[2]*pospz[0]);
        float c2 = diredge*diredge - cosphi2 * edge2l2;
        if (c2 < 0.0f && 0.0f <= c1 && c1 <= (-1.0f)*c2 && c1*c1 >= c0*c2;
                return true;
        }
for (int i=1;i<=2;i++)
{
if (!otherside[3] && !otherside[i])
        {
        vec3 edge;
        edge[0] = P[0][i] - P[0][3];
        edge[1] = P[1][i] - P[1][3];
        edge[2] = P[2][i] - P[2][3];
        float diredge = dir[0]*edge[0] + dir[1]*edge[1] + dir[2]*edge[2];
        float c0 = pospdir2[3] - cosphi2*posp2[3];
        float c1 = diredge * pospdir[3] - cosphi2 * (edge[0]*pospx[3] + edge[1]*pospy[3] + edge[2]*pospz[3]);
        float c2 = diredge*diredge - cosphi2 * (edge[0]*edge[0] + edge[1]*edge[1] + edge[2]*edge[2]);
        if (c2 < 0.0f && 0.0f <= c1 && c1 <= (-1.0f)*c2 && c1*c1 >= c0*c2)
                return true;
        }
}
// Next check if cone axis intersects quad
vec3 hitpoint;
vec3 planepoint;
planepoint[0]=P[0][0];
planepoint[1]=P[1][0];
planepoint[2]=P[2][0];
vec3 planenormal;
planenormal[0]=edge1[1]*edge2[2]-edge2[1]*edge1[2];
planenormal[1]=-edge1[0]*edge2[2]+edge2[0]*edge1[2];
planenormal[2]=edge1[0]*edge2[1]-edge2[0]*edge1[1];
float length = 1.0f/sqrtf(planenormal[0]*planenormal[0] + planenormal[1]*planenormal[1] + planenormal[2]*planenormal[2]);
planenormal *= length;
if (!RayPlaneIntersection(pos, dir, planenormal, planepoint, hitpoint))
        return false;
vec3 point = hitpoint - planepoint;
float p1 = ( point[0] * edge1[0] + point[1] * edge1[1] + point[2] * edge1[2] ) / edge1l2;
float p2 = ( point[0] * edge2[0] + point[1] * edge2[1] + point[2] * edge2[2] ) / edge2l2;
if (p1 >= 0.0f && p1 <= 1.0f && p2 >= 0.0f && p2 <= 1.0f) // cone inside quad?
        return true;
else
        return false;
}
The measured speedup from the vectorized version is summarized in the following table. The different scenarios mean the data is created so that the condition listed on the left column always happens for all the quads tested.
| Speedup factor for intersecting a dataset of 10000 quads against a cone | |
| First tested vertex was inside | 0.95x | 
| Second tested vertex was inside | 1.26x | 
| Third tested vertex was inside | 1.49x | 
| Fourth tested was vertex inside | 1.79x | 
| First tested edge intersected | 1.10x | 
| Second tested edge intersected | 1.22x | 
| Third tested edge intersected | 1.58x | 
| Fourth tested edge intersected | 2.00x | 
| Cone axis intersects quad | 1.85x | 
The vectorized version gives a definite benefit, however, at best only doubling the performance. The vectorized version was not any harder to implement than the scalar version.