diff --git a/trunk/fcl/CMakeLists.txt b/trunk/fcl/CMakeLists.txt
index 0a5e0a2f85f9e61205ebdb0c2af8d38c95826de0..9f334478726d8c1fe61f790c7397ad606e50912d 100644
--- a/trunk/fcl/CMakeLists.txt
+++ b/trunk/fcl/CMakeLists.txt
@@ -37,7 +37,7 @@ link_directories(${CCD_LIBRARY_DIRS})
 
 add_definitions(-DUSE_SVMLIGHT=0)
 
-add_library(${PROJECT_NAME} SHARED src/BV/AABB.cpp src/BV/OBB.cpp src/BV/RSS.cpp src/BV/kIOS.cpp src/vec_3f.cpp src/traversal_node_base.cpp src/traversal_node_bvhs.cpp src/intersect.cpp src/motion.cpp src/BV_fitter.cpp src/BV_splitter.cpp src/BVH_model.cpp src/BVH_utility.cpp src/transform.cpp src/simple_setup.cpp src/geometric_shapes.cpp src/geometric_shapes_utility.cpp src/geometric_shapes_intersect.cpp src/collision_node.cpp src/traversal_recurse.cpp src/broad_phase_collision.cpp src/collision.cpp src/collision_func_matrix.cpp src/interval_tree.cpp src/conservative_advancement.cpp src/matrix_3f.cpp src/interval.cpp src/interval_vector.cpp src/interval_matrix.cpp src/taylor_model.cpp src/taylor_vector.cpp src/taylor_matrix.cpp src/distance_func_matrix.cpp src/distance.cpp)
+add_library(${PROJECT_NAME} SHARED src/BV/AABB.cpp src/BV/OBB.cpp src/BV/RSS.cpp src/BV/kIOS.cpp src/traversal_node_base.cpp src/traversal_node_bvhs.cpp src/intersect.cpp src/motion.cpp src/BV_fitter.cpp src/BV_splitter.cpp src/BVH_model.cpp src/BVH_utility.cpp src/transform.cpp src/simple_setup.cpp src/geometric_shapes.cpp src/geometric_shapes_utility.cpp src/geometric_shapes_intersect.cpp src/collision_node.cpp src/traversal_recurse.cpp src/broad_phase_collision.cpp src/collision.cpp src/collision_func_matrix.cpp src/interval_tree.cpp src/conservative_advancement.cpp src/matrix_3f.cpp src/interval.cpp src/interval_vector.cpp src/interval_matrix.cpp src/taylor_model.cpp src/taylor_vector.cpp src/taylor_matrix.cpp src/distance_func_matrix.cpp src/distance.cpp)
 
 target_link_libraries(${PROJECT_NAME} ${FLANN_LIBRARIES} ${CCD_LIBRARIES})
 
diff --git a/trunk/fcl/include/fcl/collision_object.h b/trunk/fcl/include/fcl/collision_object.h
index eb062de64e144077983b3dc231afbce9583677d6..5de9b44cc41bbbcb478f5bbc4e682b010656e71d 100644
--- a/trunk/fcl/include/fcl/collision_object.h
+++ b/trunk/fcl/include/fcl/collision_object.h
@@ -131,12 +131,6 @@ public:
     aabb.max_ = center + delta;
   }
 
-  inline void computeAABB2()
-  {
-    Vec3f center = t.transform(cgeom->aabb_center);
-    // compute new r1, r2, r3
-  }
-
   void* getUserData() const
   {
     return user_data;
diff --git a/trunk/fcl/include/fcl/deprecated/vec_3f.h b/trunk/fcl/include/fcl/deprecated/vec_3f.h
new file mode 100644
index 0000000000000000000000000000000000000000..52594d30b33c5516fb39d658e736bde27d0107a8
--- /dev/null
+++ b/trunk/fcl/include/fcl/deprecated/vec_3f.h
@@ -0,0 +1,301 @@
+/*
+ * Software License Agreement (BSD License)
+ *
+ *  Copyright (c) 2011, Willow Garage, Inc.
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer in the documentation and/or other materials provided
+ *     with the distribution.
+ *   * Neither the name of Willow Garage, Inc. nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *  COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *  POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \author Jia Pan */
+
+
+#ifndef FCL_DEPRECATED_VEC_3F_H
+#define FCL_DEPRECATED_VEC_3F_H
+
+namespace fcl
+{
+
+namespace deprecated
+{
+
+
+/** \brief A class describing a three-dimensional vector */
+class Vec3f
+{
+public:
+  /** \brief vector data */
+  BVH_REAL v_[3];
+
+  Vec3f() { v_[0] = 0; v_[1] = 0; v_[2] = 0; }
+
+  Vec3f(const Vec3f& other)
+  {
+    memcpy(v_, other.v_, sizeof(BVH_REAL) * 3);
+  }
+
+  Vec3f(const BVH_REAL* v)
+  {
+    memcpy(v_, v, sizeof(BVH_REAL) * 3);
+  }
+
+  Vec3f(BVH_REAL x, BVH_REAL y, BVH_REAL z)
+  {
+    v_[0] = x;
+    v_[1] = y;
+    v_[2] = z;
+  }
+
+  ~Vec3f() {}
+
+  /** \brief Get the ith element */
+  inline BVH_REAL operator [] (size_t i) const
+  {
+    return v_[i];
+  }
+
+  inline BVH_REAL& operator[] (size_t i)
+  {
+    return v_[i];
+  }
+
+  /** \brief Add the other vector */
+  inline Vec3f& operator += (const Vec3f& other)
+  {
+    v_[0] += other.v_[0];
+    v_[1] += other.v_[1];
+    v_[2] += other.v_[2];
+    return *this;
+  }
+
+
+  /** \brief Minus the other vector */
+  inline Vec3f& operator -= (const Vec3f& other)
+  {
+    v_[0] -= other.v_[0];
+    v_[1] -= other.v_[1];
+    v_[2] -= other.v_[2];
+    return *this;
+  }
+
+  inline Vec3f& operator *= (BVH_REAL t)
+  {
+    v_[0] *= t;
+    v_[1] *= t;
+    v_[2] *= t;
+    return *this;
+  }
+
+  /** \brief Negate the vector */
+  inline Vec3f& negate()
+  {
+    v_[0] = - v_[0];
+    v_[1] = - v_[1];
+    v_[2] = - v_[2];
+    return *this;
+  }
+
+  /** \brief Return a negated vector */
+  inline Vec3f operator - () const
+  {
+    return Vec3f(-v_[0], -v_[1], -v_[2]);
+  }
+
+  /** \brief Return a summation vector */
+  inline Vec3f operator + (const Vec3f& other) const
+  {
+    return Vec3f(v_[0] + other.v_[0], v_[1] + other.v_[1], v_[2] + other.v_[2]);
+  }
+
+  /** \brief Return a substraction vector */
+  inline Vec3f operator - (const Vec3f& other) const
+  {
+    return Vec3f(v_[0] - other.v_[0], v_[1] - other.v_[1], v_[2] - other.v_[2]);
+  }
+
+  /** \brief Scale the vector by t */
+  inline Vec3f operator * (BVH_REAL t) const
+  {
+    return Vec3f(v_[0] * t, v_[1] * t, v_[2] * t);
+  }
+
+  /** \brief Return the cross product with another vector */
+  inline Vec3f cross(const Vec3f& other) const
+  {
+    return Vec3f(v_[1] * other.v_[2] - v_[2] * other.v_[1],
+                 v_[2] * other.v_[0] - v_[0] * other.v_[2],
+                 v_[0] * other.v_[1] - v_[1] * other.v_[0]);
+  }
+
+  /** \brief Return the dot product with another vector */
+  inline BVH_REAL dot(const Vec3f& other) const
+  {
+    return v_[0] * other.v_[0] + v_[1] * other.v_[1] + v_[2] * other.v_[2];
+  }
+
+  /** \brief Normalization */
+  inline bool normalize()
+  {
+    const BVH_REAL EPSILON = 1e-11;
+    BVH_REAL sqr_length = v_[0] * v_[0] + v_[1] * v_[1] + v_[2] * v_[2];
+    if(sqr_length > EPSILON * EPSILON)
+    {
+      BVH_REAL inv_length = (BVH_REAL)1.0 / (BVH_REAL)sqrt(sqr_length);
+      v_[0] *= inv_length;
+      v_[1] *= inv_length;
+      v_[2] *= inv_length;
+      return true;
+    }
+    return false;
+  }
+
+  inline Vec3f normalized() const
+  {
+    const BVH_REAL EPSILON = 1e-11;
+    BVH_REAL sqr_length = v_[0] * v_[0] + v_[1] * v_[1] + v_[2] * v_[2];
+    if(sqr_length > EPSILON * EPSILON)
+    {
+      BVH_REAL inv_length = (BVH_REAL)1.0 / (BVH_REAL)sqrt(sqr_length);
+      return *this * inv_length;
+    }
+    else
+    {
+      return *this;
+    }
+  }
+
+
+  /** \brief Return vector length */
+  inline BVH_REAL length() const
+  {
+    return sqrt(v_[0] * v_[0] + v_[1] * v_[1] + v_[2] * v_[2]);
+  }
+
+  /** \brief Return vector square length */
+  inline BVH_REAL sqrLength() const
+  {
+    return v_[0] * v_[0] + v_[1] * v_[1] + v_[2] * v_[2];
+  }
+
+  /** \brief Set the vector using new values */
+  inline void setValue(BVH_REAL x, BVH_REAL y, BVH_REAL z)
+  {
+    v_[0] = x; v_[1] = y; v_[2] = z;
+  }
+
+  /** \brief Set the vector using new values */
+  inline void setValue(BVH_REAL x)
+  {
+    v_[0] = x; v_[1] = x; v_[2] = x;
+  }
+
+  /** \brief Check whether two vectors are the same in value */
+  inline bool equal(const Vec3f& other) const
+  {
+    const BVH_REAL EPSILON = 1e-11;
+    return ((v_[0] - other.v_[0] < EPSILON) &&
+            (v_[0] - other.v_[0] > -EPSILON) &&
+            (v_[1] - other.v_[1] < EPSILON) &&
+            (v_[1] - other.v_[1] > -EPSILON) &&
+            (v_[2] - other.v_[2] < EPSILON) &&
+            (v_[2] - other.v_[2] > -EPSILON));
+  }
+
+  inline BVH_REAL triple(const Vec3f& v1, const Vec3f& v2) const
+  {
+    return v_[0] * (v1.v_[1] * v2.v_[2] - v1.v_[2] * v2.v_[1]) +
+      v_[1] * (v1.v_[2] * v2.v_[0] - v1.v_[0] * v2.v_[2]) +
+      v_[2] * (v1.v_[0] * v2.v_[1] - v1.v_[1] * v2.v_[0]);
+  }
+};
+
+
+
+
+/** \brief The minimum of two vectors */
+inline Vec3f min(const Vec3f& a, const Vec3f& b)
+{
+  Vec3f ret(std::min(a[0], b[0]), std::min(a[1], b[1]), std::min(a[2], b[2]));
+  return ret;
+}
+
+/** \brief the maximum of two vectors */
+inline Vec3f max(const Vec3f& a, const Vec3f& b)
+{
+  Vec3f ret(std::max(a[0], b[0]), std::max(a[1], b[1]), std::max(a[2], b[2]));
+  return ret;
+}
+
+inline Vec3f abs(const Vec3f& v)
+{
+  BVH_REAL x = v[0] < 0 ? -v[0] : v[0];
+  BVH_REAL y = v[1] < 0 ? -v[1] : v[1];
+  BVH_REAL z = v[2] < 0 ? -v[2] : v[2];
+
+  return Vec3f(x, y, z);
+}
+
+
+inline BVH_REAL triple(const Vec3f& a, const Vec3f& b, const Vec3f& c)
+{
+  return a.triple(b, c);
+}
+
+/** \brief generate a coordinate given a vector (i.e., generating three orthonormal vectors given a vector)
+ * w should be normalized
+ */
+static void generateCoordinateSystem(const Vec3f& w, Vec3f& u, Vec3f& v)
+{
+  BVH_REAL inv_length;
+  if(fabs(w[0]) >= fabs(w[1]))
+  {
+    inv_length = (BVH_REAL)1.0 / sqrt(w[0] * w[0] + w[2] * w[2]);
+    u[0] = -w[2] * inv_length;
+    u[1] = (BVH_REAL)0;
+    u[2] = w[0] * inv_length;
+    v[0] = w[1] * u[2];
+    v[1] = w[2] * u[0] - w[0] * u[2];
+    v[2] = -w[1] * u[0];
+  }
+  else
+  {
+    inv_length = (BVH_REAL)1.0 / sqrt(w[1] * w[1] + w[2] * w[2]);
+    u[0] = (BVH_REAL)0;
+    u[1] = w[2] * inv_length;
+    u[2] = -w[1] * inv_length;
+    v[0] = w[1] * u[2] - w[2] * u[1];
+    v[1] = -w[0] * u[2];
+    v[2] = w[0] * u[1];
+  }
+}
+
+} // deprecated
+
+} // fcl
+
+#endif
diff --git a/trunk/fcl/include/fcl/math_details.h b/trunk/fcl/include/fcl/math_details.h
new file mode 100644
index 0000000000000000000000000000000000000000..b5b8a3aa58cd6ca1cc1c200eddb7b247b8e0b510
--- /dev/null
+++ b/trunk/fcl/include/fcl/math_details.h
@@ -0,0 +1,158 @@
+/*
+ * Software License Agreement (BSD License)
+ *
+ *  Copyright (c) 2011, Willow Garage, Inc.
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer in the documentation and/or other materials provided
+ *     with the distribution.
+ *   * Neither the name of Willow Garage, Inc. nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *  COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *  POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef FCL_MATH_DETAILS_H
+#define FCL_MATH_DETAILS_H
+
+
+#include <cmath>
+#include <algorithm>
+#include <cstring>
+
+namespace fcl
+{
+
+namespace details
+{
+
+
+template <typename T>
+struct Vec3Data
+{
+  typedef T meta_type;
+
+  T vs[3];
+  Vec3Data() { setValue(0); }
+  Vec3Data(T x)
+  {
+    setValue(x);
+  }
+
+  Vec3Data(T* x)
+  {
+    memcpy(vs, x, sizeof(T) * 3);
+  }
+
+  Vec3Data(T x, T y, T z)
+  {
+    setValue(x, y, z);
+  }
+
+  inline void setValue(T x, T y, T z)
+  {
+    vs[0] = x; vs[1] = y; vs[2] = z;
+  }
+
+  inline void setValue(T x)
+  {
+    vs[0] = x; vs[1] = x; vs[2] = x;
+  }
+
+  inline void negate()
+  {
+    vs[0] = -vs[0]; vs[1] = -vs[1]; vs[2] = -vs[2];
+  }
+
+  T operator [] (size_t i) const { return vs[i]; }
+  T& operator [] (size_t i) { return vs[i]; }
+
+  inline Vec3Data<T> operator + (const Vec3Data<T>& other) const { return Vec3Data<T>(vs[0] + other.vs[0], vs[1] + other.vs[1], vs[2] + other.vs[2]); }
+  inline Vec3Data<T> operator - (const Vec3Data<T>& other) const { return Vec3Data<T>(vs[0] - other.vs[0], vs[1] - other.vs[1], vs[2] - other.vs[2]); }
+  inline Vec3Data<T> operator * (const Vec3Data<T>& other) const { return Vec3Data<T>(vs[0] * other.vs[0], vs[1] * other.vs[1], vs[2] * other.vs[2]); }
+  inline Vec3Data<T> operator / (const Vec3Data<T>& other) const { return Vec3Data<T>(vs[0] / other.vs[0], vs[1] / other.vs[1], vs[2] / other.vs[2]); }
+  inline Vec3Data<T>& operator += (const Vec3Data<T>& other) { vs[0] += other.vs[0]; vs[1] += other.vs[1]; vs[2] += other.vs[2]; return *this; }
+  inline Vec3Data<T>& operator -= (const Vec3Data<T>& other) { vs[0] -= other.vs[0]; vs[1] -= other.vs[1]; vs[2] -= other.vs[2]; return *this; }
+  inline Vec3Data<T>& operator *= (const Vec3Data<T>& other) { vs[0] *= other.vs[0]; vs[1] *= other.vs[1]; vs[2] *= other.vs[2]; return *this; }
+  inline Vec3Data<T>& operator /= (const Vec3Data<T>& other) { vs[0] /= other.vs[0]; vs[1] /= other.vs[1]; vs[2] /= other.vs[2]; return *this; }
+  inline Vec3Data<T> operator + (T t) const { return Vec3Data<T>(vs[0] + t, vs[1] + t, vs[2] + t); }
+  inline Vec3Data<T> operator - (T t) const { return Vec3Data<T>(vs[0] - t, vs[1] - t, vs[2] - t); }
+  inline Vec3Data<T> operator * (T t) const { return Vec3Data<T>(vs[0] * t, vs[1] * t, vs[2] * t); }
+  inline Vec3Data<T> operator / (T t) const { T inv_t = 1.0 / t; return Vec3Data<T>(vs[0] * inv_t, vs[1] * inv_t, vs[2] * inv_t); }
+  inline Vec3Data<T>& operator += (T t) { vs[0] += t; vs[1] += t; vs[2] += t; return *this; }
+  inline Vec3Data<T>& operator -= (T t) { vs[0] -= t; vs[1] -= t; vs[2] -= t; return *this; }
+  inline Vec3Data<T>& operator *= (T t) { vs[0] *= t; vs[1] *= t; vs[2] *= t; return *this; }
+  inline Vec3Data<T>& operator /= (T t) { T inv_t = 1.0 / t; vs[0] *= inv_t; vs[1] *= inv_t; vs[2] *= inv_t; return *this; }
+  inline Vec3Data<T> operator - () const { return Vec3Data<T>(-vs[0], -vs[1], -vs[2]); }
+};
+
+
+template <typename T>
+static inline Vec3Data<T> cross_prod(const Vec3Data<T>& l, const Vec3Data<T>& r)
+{
+  return Vec3Data<T>(l.vs[1] * r.vs[2] - l.vs[2] * r.vs[1], 
+                     l.vs[2] * r.vs[0] - l.vs[0] * r.vs[2],
+                     l.vs[0] * r.vs[1] - l.vs[1] * r.vs[0]);
+}
+
+template <typename T>
+static inline T dot_prod(const Vec3Data<T>& l, const Vec3Data<T>& r)
+{
+  return l.vs[0] * r.vs[0] + l.vs[1] * r.vs[1] + l.vs[2] * r.vs[2];
+}
+
+
+template <typename T>
+static inline Vec3Data<T> min(const Vec3Data<T>& x, const Vec3Data<T>& y)
+{
+  return Vec3Data<T>(std::min(x.vs[0], y.vs[0]), std::min(x.vs[1], y.vs[1]), std::min(x.vs[2], y.vs[2]));
+}
+
+template <typename T>
+static inline Vec3Data<T> max(const Vec3Data<T>& x, const Vec3Data<T>& y)
+{
+  return Vec3Data<T>(std::max(x.vs[0], y.vs[0]), std::max(x.vs[1], y.vs[1]), std::max(x.vs[2], y.vs[2]));
+}
+
+template <typename T>
+static inline Vec3Data<T> abs(const Vec3Data<T>& x)
+{
+  return Vec3Data<T>(std::abs(x.vs[0]), std::abs(x.vs[1]), std::abs(x.vs[2]));
+}
+
+template <typename T>
+static inline bool equal(const Vec3Data<T>& x, const Vec3Data<T>& y, T epsilon)
+{
+  return ((x.vs[0] - y.vs[0] < epsilon) &&
+          (x.vs[0] - y.vs[0] > -epsilon) &&
+          (x.vs[1] - y.vs[1] < epsilon) &&
+          (x.vs[1] - y.vs[1] > -epsilon) &&
+          (x.vs[2] - y.vs[2] < epsilon) &&
+          (x.vs[2] - y.vs[2] > -epsilon));
+}
+
+}
+
+}
+
+#endif
diff --git a/trunk/fcl/include/fcl/math_simd_details.h b/trunk/fcl/include/fcl/math_simd_details.h
new file mode 100644
index 0000000000000000000000000000000000000000..c284864643d0d06c7d9e6a3efb7e04e0ee24b421
--- /dev/null
+++ b/trunk/fcl/include/fcl/math_simd_details.h
@@ -0,0 +1,323 @@
+/*
+ * Software License Agreement (BSD License)
+ *
+ *  Copyright (c) 2011, Willow Garage, Inc.
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer in the documentation and/or other materials provided
+ *     with the distribution.
+ *   * Neither the name of Willow Garage, Inc. nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *  COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *  POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \author Jia Pan */
+
+
+#ifndef FCL_MATH_SIMD_DETAILS_H
+#define FCL_MATH_SIMD_DETAILS_H
+
+#include <xmmintrin.h>
+#if defined (__SSE3__)
+#include <pmmintrin.h>
+#endif
+#if defined (__SSE4__)
+#include <smmintrin.h>
+#endif
+
+
+namespace fcl
+{
+
+/** \brief FCL internals. Ignore this :) unless you are God */
+namespace details
+{
+
+const __m128  xmms_0 = {0.f, 0.f, 0.f, 0.f};
+const __m128d xmmd_0 = {0, 0};
+
+struct sse_meta_f4
+{
+  typedef float meta_type;
+
+  union {float vs[4]; __m128 v; }; 
+  sse_meta_f4() : v(_mm_set1_ps(0)) {}
+  sse_meta_f4(float x) : v(_mm_set1_ps(x)) {}
+  sse_meta_f4(float* px) : v(_mm_load_ps(px)) {}
+  sse_meta_f4(__m128 x) : v(x) {}
+  sse_meta_f4(float x, float y, float z, float w = 0) : v(_mm_setr_ps(x, y, z, w)) {}
+  void setValue(float x, float y, float z, float w = 0) { v = _mm_setr_ps(x, y, z, w); }
+  void setValue(float x) { v = _mm_set1_ps(x); }
+  void negate() { v = _mm_sub_ps(xmms_0, v); }
+  
+  inline void* operator new [] (size_t n) { return _mm_malloc(n, 16); }
+  inline void operator delete [] (void* x) { if(x) _mm_free(x); }
+  inline float operator [] (size_t i) const { return vs[i]; }
+  inline float& operator [] (size_t i) { return vs[i]; }
+
+  inline sse_meta_f4 operator + (const sse_meta_f4& other) const { return sse_meta_f4(_mm_add_ps(v, other.v)); }
+  inline sse_meta_f4 operator - (const sse_meta_f4& other) const { return sse_meta_f4(_mm_sub_ps(v, other.v)); }
+  inline sse_meta_f4 operator * (const sse_meta_f4& other) const { return sse_meta_f4(_mm_mul_ps(v, other.v)); }
+  inline sse_meta_f4 operator / (const sse_meta_f4& other) const { return sse_meta_f4(_mm_div_ps(v, other.v)); }
+  inline sse_meta_f4& operator += (const sse_meta_f4& other) { v = _mm_add_ps(v, other.v); return *this; }
+  inline sse_meta_f4& operator -= (const sse_meta_f4& other) { v = _mm_sub_ps(v, other.v); return *this; }
+  inline sse_meta_f4& operator *= (const sse_meta_f4& other) { v = _mm_mul_ps(v, other.v); return *this; }
+  inline sse_meta_f4& operator /= (const sse_meta_f4& other) { v = _mm_div_ps(v, other.v); return *this; }
+  inline sse_meta_f4 operator + (float t) const { return sse_meta_f4(_mm_add_ps(v, _mm_set1_ps(t))); }
+  inline sse_meta_f4 operator - (float t) const { return sse_meta_f4(_mm_sub_ps(v, _mm_set1_ps(t))); }
+  inline sse_meta_f4 operator * (float t) const { return sse_meta_f4(_mm_mul_ps(v, _mm_set1_ps(t))); }
+  inline sse_meta_f4 operator / (float t) const { return sse_meta_f4(_mm_div_ps(v, _mm_set1_ps(t))); }
+  inline sse_meta_f4& operator += (float t) { v = _mm_add_ps(v, _mm_set1_ps(t)); return *this; }
+  inline sse_meta_f4& operator -= (float t) { v = _mm_sub_ps(v, _mm_set1_ps(t)); return *this; }
+  inline sse_meta_f4& operator *= (float t) { v = _mm_mul_ps(v, _mm_set1_ps(t)); return *this; }
+  inline sse_meta_f4& operator /= (float t) { v = _mm_div_ps(v, _mm_set1_ps(t)); return *this; }
+  inline sse_meta_f4 operator - () const { return sse_meta_f4(_mm_sub_ps(xmms_0, v)); }
+} __attribute__ ((aligned (16)));
+
+struct sse_meta_d4
+{
+  typedef double meta_type;
+
+  union {double vs[4]; __m128d v[2]; };
+  sse_meta_d4()
+  {
+    setValue(0.0);
+  }
+                    
+  sse_meta_d4(double x)
+  {
+    setValue(x);
+  }
+  
+  sse_meta_d4(double* px)
+  {
+    v[0] = _mm_load_pd(px);
+    v[1] = _mm_set_pd(0, *(px + 2));
+  }
+  
+  sse_meta_d4(__m128d x, __m128d y)
+  {
+    v[0] = x;
+    v[1] = y;
+  }
+
+  sse_meta_d4(double x, double y, double z, double w = 0)
+  {
+    setValue(x, y, z, w);
+  }
+
+  inline void setValue(double x, double y, double z, double w = 0)
+  {
+    v[0] = _mm_setr_pd(x, y);
+    v[1] = _mm_setr_pd(z, w);
+  }
+
+  inline void setValue(double x)
+  {
+    v[0] = _mm_set1_pd(x);
+    v[1] = v[0];
+  }
+
+  inline void negate()
+  {
+    v[0] = _mm_sub_pd(xmmd_0, v[0]);
+    v[1] = _mm_sub_pd(xmmd_0, v[1]);
+  }
+
+  inline void* operator new [] (size_t n)
+  {
+    return _mm_malloc(n, 16);
+  }
+
+  inline void operator delete [] (void* x)
+  {
+    if(x) _mm_free(x);
+  }
+
+  double operator [] (size_t i) const { return vs[i]; }
+  double& operator [] (size_t i) { return vs[i]; }
+
+  inline sse_meta_d4 operator + (const sse_meta_d4& other) const { return sse_meta_d4(_mm_add_pd(v[0], other.v[0]), _mm_add_pd(v[1], other.v[1])); }
+  inline sse_meta_d4 operator - (const sse_meta_d4& other) const { return sse_meta_d4(_mm_sub_pd(v[0], other.v[0]), _mm_sub_pd(v[1], other.v[1])); }
+  inline sse_meta_d4 operator * (const sse_meta_d4& other) const { return sse_meta_d4(_mm_mul_pd(v[0], other.v[0]), _mm_mul_pd(v[1], other.v[1])); }
+  inline sse_meta_d4 operator / (const sse_meta_d4& other) const { return sse_meta_d4(_mm_div_pd(v[0], other.v[0]), _mm_div_pd(v[1], other.v[1])); }
+  inline sse_meta_d4& operator += (const sse_meta_d4& other) { v[0] = _mm_add_pd(v[0], other.v[0]); v[1] = _mm_add_pd(v[1], other.v[1]); return *this; }
+  inline sse_meta_d4& operator -= (const sse_meta_d4& other) { v[0] = _mm_sub_pd(v[0], other.v[0]); v[1] = _mm_sub_pd(v[1], other.v[1]); return *this; }
+  inline sse_meta_d4& operator *= (const sse_meta_d4& other) { v[0] = _mm_mul_pd(v[0], other.v[0]); v[1] = _mm_mul_pd(v[1], other.v[1]); return *this; }
+  inline sse_meta_d4& operator /= (const sse_meta_d4& other) { v[0] = _mm_div_pd(v[0], other.v[0]); v[1] = _mm_div_pd(v[1], other.v[1]); return *this; }
+  inline sse_meta_d4 operator + (double t) const { register __m128d d = _mm_set1_pd(t); return sse_meta_d4(_mm_add_pd(v[0], d), _mm_add_pd(v[1], d)); }
+  inline sse_meta_d4 operator - (double t) const { register __m128d d = _mm_set1_pd(t); return sse_meta_d4(_mm_sub_pd(v[0], d), _mm_sub_pd(v[1], d)); }
+  inline sse_meta_d4 operator * (double t) const { register __m128d d = _mm_set1_pd(t); return sse_meta_d4(_mm_mul_pd(v[0], d), _mm_mul_pd(v[1], d)); }
+  inline sse_meta_d4 operator / (double t) const { register __m128d d = _mm_set1_pd(t); return sse_meta_d4(_mm_div_pd(v[0], d), _mm_div_pd(v[1], d)); }
+  inline sse_meta_d4& operator += (double t) { register __m128d d = _mm_set1_pd(t); v[0] = _mm_add_pd(v[0], d); v[1] = _mm_add_pd(v[1], d); return *this; }
+  inline sse_meta_d4& operator -= (double t) { register __m128d d = _mm_set1_pd(t); v[0] = _mm_sub_pd(v[0], d); v[1] = _mm_sub_pd(v[1], d); return *this; }
+  inline sse_meta_d4& operator *= (double t) { register __m128d d = _mm_set1_pd(t); v[0] = _mm_mul_pd(v[0], d); v[1] = _mm_mul_pd(v[1], d); return *this; }
+  inline sse_meta_d4& operator /= (double t) { register __m128d d = _mm_set1_pd(t); v[0] = _mm_div_pd(v[0], d); v[1] = _mm_div_pd(v[1], d); return *this; }
+  inline sse_meta_d4 operator - () const { return sse_meta_d4(_mm_sub_pd(xmmd_0, v[0]), _mm_sub_pd(xmmd_0, v[1])); }
+} __attribute__ ((aligned (16)));
+
+
+static inline sse_meta_f4 cross_prod(const sse_meta_f4& x, const sse_meta_f4& y)
+{
+  // set to a[1][2][0][3] , b[2][0][1][3]
+  // multiply
+  static const int s1 = _MM_SHUFFLE(3, 0, 2, 1);
+  static const int s2 = _MM_SHUFFLE(3, 1, 0, 2);
+  __m128 xa = _mm_mul_ps(_mm_shuffle_ps(x.v, x.v, s1), _mm_shuffle_ps(y.v, y.v, s2));
+
+  // set to a[2][0][1][3] , b[1][2][0][3]
+  // multiply
+  __m128 xb = _mm_mul_ps(_mm_shuffle_ps(x.v, x.v, s2), _mm_shuffle_ps(y.v, y.v, s1));
+
+  // subtract
+  return sse_meta_f4(_mm_sub_ps(xa, xb));
+}
+
+static inline sse_meta_d4 cross_prod(const sse_meta_d4& x, const sse_meta_d4& y)
+{
+  static const int s0 = _MM_SHUFFLE2(0, 0);
+  static const int s1 = _MM_SHUFFLE2(0, 1);
+  static const int s2 = _MM_SHUFFLE2(1, 0);
+  static const int s3 = _MM_SHUFFLE2(1, 1);  
+  __m128d xa1 = _mm_mul_pd(_mm_shuffle_pd(x.v[0], x.v[1], s1), _mm_shuffle_pd(y.v[1], y.v[0], s0));
+  __m128d ya1 = _mm_mul_pd(_mm_shuffle_pd(x.v[0], x.v[1], s2), _mm_shuffle_pd(y.v[0], y.v[1], s3));
+  
+  __m128d xa2 = _mm_mul_pd(_mm_shuffle_pd(x.v[1], x.v[0], s0), _mm_shuffle_pd(y.v[0], y.v[1], s1));
+  __m128d ya2 = _mm_mul_pd(_mm_shuffle_pd(x.v[0], y.v[1], s3), _mm_shuffle_pd(y.v[0], y.v[1], s2));
+
+  return sse_meta_d4(_mm_sub_pd(xa1, xa2), _mm_sub_pd(ya1, ya2));
+}
+
+static inline float dot_prod(const sse_meta_f4& x, const sse_meta_f4& y)
+{
+#if defined (__SSE4__)
+  return _mm_cvtss_f32(_mm_dp_ps(x.v, y.v, 0x71));
+#elif defined (__SSE3__)
+  register __m128 t = _mm_mul_ps(x.v, y.v);
+  t = _mm_hadd_ps(t, t);
+  t = _mm_hadd_ps(t, t);
+  return _mm_cvtss_f32(t);
+#else
+  register __m128 s = _mm_mul_ps(x.v, y.v);
+  register __m128 r = _mm_add_ss(s, _mm_movehl_ps(s, s));
+  r = _mm_add_ss(r, _mm_shuffle_ps(r, r, 1));
+  return _mm_cvtss_f32(r);
+#endif
+}
+
+static inline double dot_prod(const sse_meta_d4& x, const sse_meta_d4& y)
+{
+#if defined (__SSE4__)
+  register __m128d t1 = _mm_dp_pd(x.v[0], y.v[0], 0x31);
+  register __m128d t2 = _mm_dp_pd(x.v[1], y.v[1], 0x11);
+  t1 = _mm_add_pd(t1, t2);
+  double d;
+  _mm_storel_pd(&d, t1);
+  return d;
+#elif defined (__SSE3__)
+  double d;
+  register __m128d t1 = _mm_mul_pd(x.v[0], y.v[0]);
+  register __m128d t2 = _mm_mul_pd(x.v[1], y.v[1]);
+  t1 = _mm_hadd_pd(t1, t1);
+  t2 = _mm_hadd_pd(t2, t2);
+  t1 = _mm_add_pd(t1, t2);
+  _mm_storeh_pl(&d, t1);
+  return d;
+#else 
+  double d;
+  register __m128d t1 = _mm_mul_pd(x.v[0], y.v[0]);
+  register __m128d t2 = _mm_mul_pd(x.v[1], y.v[1]);
+  t1 = _mm_add_pd(t1, t2);
+  t1 = _mm_add_pd(t1, _mm_shuffle_pd(t1, t1, 1));
+  _mm_storel_pd(&d, t1);
+  return d;
+#endif
+}
+
+static inline sse_meta_f4 min(const sse_meta_f4& x, const sse_meta_f4& y)
+{
+  return sse_meta_f4(_mm_min_ps(x.v, y.v));
+}
+
+static inline sse_meta_d4 min(const sse_meta_d4& x, const sse_meta_d4& y)
+{
+  return sse_meta_d4(_mm_min_pd(x.v[0], y.v[0]), _mm_min_pd(x.v[1], y.v[1]));
+}
+
+static inline sse_meta_f4 max(const sse_meta_f4& x, const sse_meta_f4& y)
+{
+  return sse_meta_f4(_mm_max_ps(x.v, y.v));
+}
+
+static inline sse_meta_d4 max(const sse_meta_d4& x, const sse_meta_d4& y)
+{
+  return sse_meta_d4(_mm_max_pd(x.v[0], y.v[0]), _mm_max_pd(x.v[1], y.v[1]));
+}
+
+static inline sse_meta_f4 abs(const sse_meta_f4& x)
+{
+  static const union { int i[4]; __m128 m; } abs4mask = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
+  return sse_meta_f4(_mm_and_ps(x.v, abs4mask.m));
+}
+
+static inline sse_meta_d4 abs(const sse_meta_d4& x)
+{
+  static const union { int64_t i[2]; __m128d m; } abs2mask = {0x7fffffffffffffff, 0x7fffffffffffffff};
+  return sse_meta_d4(_mm_and_pd(x.v[0], abs2mask.m), _mm_and_pd(x.v[1], abs2mask.m));
+}
+
+static inline bool equal(const sse_meta_f4& x, const sse_meta_f4& y, float epsilon)
+{
+  epsilon = 1e-3;
+  //static const union { int i[4]; __m128 m; } abs4mask = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
+  //return (_mm_movemask_ps(_mm_cmplt_ps(_mm_and_ps(_mm_sub_ps(x.v, y.v), abs4mask.m), _mm_set1_ps(epsilon))) == 0x0f);
+  register __m128 d = _mm_sub_ps(x.v, y.v);
+  register __m128 e = _mm_set1_ps(epsilon);
+  return ((_mm_movemask_ps(_mm_cmplt_ps(d, e)) & 0x7) == 0x7) && ((_mm_movemask_ps(_mm_cmpgt_ps(d, _mm_sub_ps(xmms_0, e))) & 0x7) == 0x7);
+}
+
+static inline bool equal(const sse_meta_d4& x, const sse_meta_d4& y, double epsilon)
+{
+  // static const union { int64 i[2]; __m128d m; } abs2mask = {0x7fffffffffffffff, 0x7fffffffffffffff};
+  // return (_mm_movemask_pd(_mm_cmplt_pd(_mm_and_pd(_mm_sub_pd(x.v[0], y.v[0]), abs2mask.m), _mm_set1_pd(epsilon))) == 0x3) && (_mm_movemask_pd(_mm_cmplt_pd(_mm_and_pd(_mm_sub_pd(x.v[1], y.v[1]), abs2mask.m), _mm_set1_pd(epsilon))) == 0x3);
+
+  register __m128d d = _mm_sub_pd(x.v[0], y.v[0]);
+  register __m128d e = _mm_set1_pd(epsilon);
+  
+  if(_mm_movemask_pd(_mm_cmplt_pd(d, e)) != 0x3) return false;
+  if(_mm_movemask_pd(_mm_cmpgt_pd(d, _mm_sub_pd(xmmd_0, e))) != 0x3) return false;
+  
+  d = _mm_sub_pd(x.v[1], y.v[1]);
+  if((_mm_movemask_pd(_mm_cmplt_pd(d, e)) & 0x1) != 0x1) return false;
+  if((_mm_movemask_pd(_mm_cmpgt_pd(d, _mm_sub_pd(xmmd_0, e))) & 0x1) != 0x1) return false;
+  return true;
+}
+
+
+
+} // details
+} // fcl
+
+
+#endif
diff --git a/trunk/fcl/src/vec_3f.cpp b/trunk/fcl/include/fcl/simd.h
similarity index 68%
rename from trunk/fcl/src/vec_3f.cpp
rename to trunk/fcl/include/fcl/simd.h
index d95176794d1047da48442f54e6f281b3e9c59019..1cb1e4057a6b31efc1571347129f7a0ccdfdcc00 100644
--- a/trunk/fcl/src/vec_3f.cpp
+++ b/trunk/fcl/include/fcl/simd.h
@@ -34,42 +34,9 @@
 
 /** \author Jia Pan */
 
-#include "fcl/vec_3f.h"
-#include <iostream>
+#ifndef FCL_SIMD_H
+#define FCL_SIMD_H
 
-namespace fcl
-{
-#if COLLISION_USE_SSE
-const float Vec3f::EPSILON = 1e-11;
-#else
-const BVH_REAL Vec3f::EPSILON = 1e-11;
-#endif
-
-
-void generateCoordinateSystem(const Vec3f& w, Vec3f& u, Vec3f& v)
-{
-  BVH_REAL inv_length;
-  if(fabs(w[0]) >= fabs(w[1]))
-  {
-    inv_length = (BVH_REAL)1.0 / sqrt(w[0] * w[0] + w[2] * w[2]);
-    u[0] = -w[2] * inv_length;
-    u[1] = (BVH_REAL)0;
-    u[2] = w[0] * inv_length;
-    v[0] = w[1] * u[2];
-    v[1] = w[2] * u[0] - w[0] * u[2];
-    v[2] = -w[1] * u[0];
-  }
-  else
-  {
-    inv_length = (BVH_REAL)1.0 / sqrt(w[1] * w[1] + w[2] * w[2]);
-    u[0] = (BVH_REAL)0;
-    u[1] = w[2] * inv_length;
-    u[2] = -w[1] * inv_length;
-    v[0] = w[1] * u[2] - w[2] * u[1];
-    v[1] = -w[0] * u[2];
-    v[2] = w[0] * u[1];
-  }
-}
+#include "fcl/math_simd_details.h"
 
-
-}
+#endif
diff --git a/trunk/fcl/include/fcl/vec_3f.h b/trunk/fcl/include/fcl/vec_3f.h
index 86f79dd32f9f0fd079faa3a6b8e0f16b59bf2e44..60d2256d859a8f18a7b3c9f105a058c52a9999e3 100644
--- a/trunk/fcl/include/fcl/vec_3f.h
+++ b/trunk/fcl/include/fcl/vec_3f.h
@@ -34,500 +34,146 @@
 
 /** \author Jia Pan */
 
-#ifndef COLLISION_CHECKING_VEC_3F_H
-#define COLLISION_CHECKING_VEC_3F_H
+#ifndef FCL_VEC_3F_H
+#define FCL_VEC_3F_H
 
 #include "fcl/BVH_internal.h"
+#include "fcl/math_details.h"
 #include <cmath>
-#include <cstdlib>
-#include <algorithm>
-#include <cstring>
+#include <iostream>
+#include <limits>
+
 
-/** \brief Main namespace */
 namespace fcl
 {
 
-#if COLLISION_USE_SSE
-#include <xmmintrin.h>
-#include <pmmintrin.h>
-  inline __m128 _mm_cross_ps(__m128 a , __m128 b)
-  {
-    // set to a[1][2][0][3] , b[2][0][1][3]
-    // multiply
-    __m128 xa = _mm_mul_ps(
-                _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)),
-                _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2)));
-
-    // set to a[2][0][1][3] , b[1][2][0][3]
-    // multiply
-    __m128 xb = _mm_mul_ps(
-                _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)),
-                _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1 )));
-
-    // subtract
-    return _mm_sub_ps(xa, xb);
-  }
-
-  const __m128  xmms_0 = {0.f, 0.f, 0.f, 0.f};
-
-  union ieee754_QNAN
-  {
-    const float f;
-    struct
-    {
-      const unsigned int mantissa:23, exp:8, sign:1;
-    };
-
-    ieee754_QNAN() : f(0.0f), mantissa(0x7FFFFF), exp(0xFF), sign(0x0) {}
-  } __attribute__ ((aligned (16)));
-
-
-  class Vec3f
+template <typename T>
+class Vec3fX
+{
+public:
+  typedef typename T::meta_type U;
+  T data;
+
+  Vec3fX() {}
+  Vec3fX(const Vec3fX& other) : data(other.data) {}
+  Vec3fX(U* v) : data(v) {}
+  Vec3fX(U x, U y, U z) : data(x, y, z) {}
+  Vec3fX(const T& data_) : data(data_) {}
+
+  inline U operator [] (size_t i) const { return data[i]; }
+  inline U& operator [] (size_t i) { return data[i]; }
+
+  inline Vec3fX operator + (const Vec3fX& other) const { return Vec3fX(data + other.data); }
+  inline Vec3fX operator - (const Vec3fX& other) const { return Vec3fX(data - other.data); }
+  inline Vec3fX operator * (const Vec3fX& other) const { return Vec3fX(data * other.data); }
+  inline Vec3fX operator / (const Vec3fX& other) const { return Vec3fX(data / other.data); }
+  inline Vec3fX& operator += (const Vec3fX& other) { data += other.data; return *this; }
+  inline Vec3fX& operator -= (const Vec3fX& other) { data -= other.data; return *this; }
+  inline Vec3fX& operator *= (const Vec3fX& other) { data *= other.data; return *this; }
+  inline Vec3fX& operator /= (const Vec3fX& other) { data /= other.data; return *this; }
+  inline Vec3fX operator + (U t) const { return Vec3fX(data + t); }
+  inline Vec3fX operator - (U t) const { return Vec3fX(data - t); }
+  inline Vec3fX operator * (U t) const { return Vec3fX(data * t); }
+  inline Vec3fX operator / (U t) const { return Vec3fX(data / t); }
+  inline Vec3fX& operator += (U t) { data += t; return *this; }
+  inline Vec3fX& operator -= (U t) { data -= t; return *this; }
+  inline Vec3fX& operator *= (U t) { data *= t; return *this; }
+  inline Vec3fX& operator /= (U t) { data /= t; return *this; }
+  inline Vec3fX operator - () { return Vec3fX(-data); }
+  inline Vec3fX cross(const Vec3fX& other) const { return Vec3fX(details::cross_prod(data, other.data)); }
+  inline U dot(const Vec3fX& other) const { return details::dot_prod(data, other.data); }
+  inline bool normalize()
   {
-  public:
-    /** \brief vector data */
-    union {float v_[4]; __m128 v4; } __attribute__ ((aligned (16)));
-
-    Vec3f() { v4 = _mm_set1_ps(0); }
-
-    Vec3f(const Vec3f& other)
-    {
-      v4 = other.v4;
-    }
-
-    Vec3f(const float* v)
-    {
-      v_[0] = v[0];
-      v_[1] = v[1];
-      v_[2] = v[2];
-      v_[3] = 0.0f;
-    }
-
-    Vec3f(float x, float y, float z)
-    {
-      v_[0] = x;
-      v_[1] = y;
-      v_[2] = z;
-      v_[3] = 0.0f;
-    }
-
-
-    Vec3f(__m128 v)
+    U sqr_length = details::dot_prod(data, data);
+    if(sqr_length > 0)
     {
-      v4 = v;
+      *this /= (U)sqrt(sqr_length);
+      return true;
     }
-
-    ~Vec3f() {}
-
-    /** \brief Get the ith element */
-    inline float operator [] (size_t i) const
-    {
-      return v_[i];
-    }
-
-    inline float& operator[] (size_t i)
-    {
-      return v_[i];
-    }
-
-    /** \brief Add the other vector */
-    inline Vec3f& operator += (const Vec3f& other)
-    {
-      v4 = _mm_add_ps(v4, other.v4);
-      return *this;
-    }
-
-
-    /** \brief Minus the other vector */
-    inline Vec3f& operator -= (const Vec3f& other)
-    {
-      v4 = _mm_sub_ps(v4, other.v4);
-      return *this;
-    }
-
-    inline Vec3f& operator *= (BVH_REAL t)
-    {
-      v4 = _mm_mul_ps(_mm_set1_ps(t), v4);
-      return *this;
-    }
-
-    /** \brief Negate the vector */
-    inline void negate()
-    {
-      v4 = _mm_sub_ps(xmms_0, v4);
-    }
-
-    /** \brief Return a negated vector */
-    inline Vec3f operator - () const
-    {
-      return Vec3f(_mm_sub_ps(xmms_0, v4));
-    }
-
-
-    /** \brief Return a summation vector */
-    inline Vec3f operator + (const Vec3f& other) const
-    {
-      return Vec3f(_mm_add_ps(v4, other.v4));
-    }
-
-    /** \brief Return a substraction vector */
-    inline Vec3f operator - (const Vec3f& other) const
-    {
-      return Vec3f(_mm_sub_ps(v4, other.v4));
-    }
-
-    /** \brief Scale the vector by t */
-    inline Vec3f operator * (float t) const
-    {
-      return Vec3f(_mm_mul_ps(_mm_set1_ps(t), v4));
-    }
-
-    /** \brief Return the cross product with another vector */
-    inline Vec3f cross(const Vec3f& other) const
-    {
-      return Vec3f(_mm_cross_ps(v4, other.v4));
-    }
-
-    /** \brief Return the dot product with another vector */
-    inline float dot(const Vec3f& other) const
-    {
-      float d;
-      register __m128 t = _mm_mul_ps(other.v4, v4);
-      t = _mm_hadd_ps(t, t);
-      t = _mm_hadd_ps(t, t);
-      _mm_store_ss(&d, t);
-      return d;
-    }
-
-    /** \brief Normalization */
-    inline bool normalize()
-    {
-      float sqr_length = sqrLength();
-      if(sqr_length > EPSILON)
-      {
-        float inv_length = 1.0 / sqrt(sqr_length);
-        v4 = _mm_mul_ps(_mm_set1_ps(inv_length), v4);
-        return true;
-      }
+    else
       return false;
-    }
-
-    inline Vec3f normalized() const
-    {
-      BVH_REAL sqr_length = v_[0] * v_[0] + v_[1] * v_[1] + v_[2] * v_[2];
-      if(sqr_length > EPSILON * EPSILON)
-      {
-        BVH_REAL inv_length = (BVH_REAL)1.0 / (BVH_REAL)sqrt(sqr_length);
-        return *this * inv_length;
-      }
-      else
-      {
-        return *this;
-      }
-    }
-
-    /** \brief Return vector length */
-    inline float length() const
-    {
-      return sqrt(sqrLength());
-    }
-
-    /** \brief Return vector square length */
-    inline float sqrLength() const
-    {
-      float d;
-      register __m128 t = _mm_mul_ps(v4, v4);
-      t = _mm_hadd_ps(t, t);
-      t = _mm_hadd_ps(t, t);
-      _mm_store_ss(&d, t);
-      return d;
-    }
-
-    /** \brief Set the vector using new values */
-    inline void setValue(float x, float y, float z)
-    {
-      v_[0] = x; v_[1] = y; v_[2] = z; v_[3] = 0.0f;
-    }
-
-    /** \brief Set the vector using new values */
-    inline void setValue(BVH_REAL x)
-    {
-      v_[0] = x; v_[1] = x; v_[2] = x; v_[3] = 0.0f;
-    }
-
-    /** \brief Check whether two vectors are the same in abstracted value */
-    inline bool equal(const Vec3f& other) const
-    {
-      return ((v_[0] - other.v_[0] < EPSILON) &&
-             (v_[0] - other.v_[0] > -EPSILON) &&
-             (v_[1] - other.v_[1] < EPSILON) &&
-             (v_[1] - other.v_[1] > -EPSILON) &&
-             (v_[2] - other.v_[2] < EPSILON) &&
-             (v_[2] - other.v_[2] > -EPSILON));
-    }
-
-    inline BVH_REAL triple(const Vec3f& v1, const Vec3f& v2) const
-    {
-      return v_[0] * (v1.v_[1] * v2.v_[2] - v1.v_[2] * v2.v_[1]) +
-          v_[1] * (v1.v_[2] * v2.v_[0] - v1.v_[0] * v2.v_[2]) +
-          v_[2] * (v1.v_[0] * v2.v_[1] - v1.v_[1] * v2.v_[0]);
-    }
-
-  private:
-    /** \brief Tolerance for comparision */
-    static const float EPSILON;
-
-  };
-
-
-  /** \brief The minimum of two vectors */
-  inline Vec3f min(const Vec3f& a, const Vec3f& b)
-  {
-    Vec3f ret(_mm_min_ps(a.v4, b.v4));
-    return ret;
-  }
-
-  /** \brief the maximum of two vectors */
-  inline Vec3f max(const Vec3f& a, const Vec3f& b)
-  {
-    Vec3f ret(_mm_max_ps(a.v4, b.v4));
-    return ret;
   }
 
-  inline Vec3f abs(const Vec3f& v)
+  inline Vec3fX normalized() const
   {
-    const ieee754_QNAN mask;
-    __m128 abs4mask = _mm_load1_ps(&mask.f);
-    return Vec3f(_mm_and_ps(abs4mask, v.v4));
-  }
-#else
-/** \brief A class describing a three-dimensional vector */
-  class Vec3f
-  {
-  public:
-    /** \brief vector data */
-    BVH_REAL v_[3];
-
-    Vec3f() { v_[0] = 0; v_[1] = 0; v_[2] = 0; }
-
-    Vec3f(const Vec3f& other)
-    {
-      memcpy(v_, other.v_, sizeof(BVH_REAL) * 3);
-    }
-
-    Vec3f(const BVH_REAL* v)
-    {
-      memcpy(v_, v, sizeof(BVH_REAL) * 3);
-    }
-
-    Vec3f(BVH_REAL x, BVH_REAL y, BVH_REAL z)
-    {
-      v_[0] = x;
-      v_[1] = y;
-      v_[2] = z;
-    }
-
-    ~Vec3f() {}
-
-    /** \brief Get the ith element */
-    inline BVH_REAL operator [] (size_t i) const
-    {
-      return v_[i];
-    }
-
-    inline BVH_REAL& operator[] (size_t i)
-    {
-      return v_[i];
-    }
-
-    /** \brief Add the other vector */
-    inline Vec3f& operator += (const Vec3f& other)
-    {
-      v_[0] += other.v_[0];
-      v_[1] += other.v_[1];
-      v_[2] += other.v_[2];
+    U sqr_length = details::dot_prod(data, data);
+    if(sqr_length > 0)
+      return *this / (U)sqrt(sqr_length);
+    else
       return *this;
-    }
-
-
-    /** \brief Minus the other vector */
-    inline Vec3f& operator -= (const Vec3f& other)
-    {
-      v_[0] -= other.v_[0];
-      v_[1] -= other.v_[1];
-      v_[2] -= other.v_[2];
-      return *this;
-    }
-
-    inline Vec3f& operator *= (BVH_REAL t)
-    {
-      v_[0] *= t;
-      v_[1] *= t;
-      v_[2] *= t;
-      return *this;
-    }
-
-    /** \brief Negate the vector */
-    inline Vec3f& negate()
-    {
-      v_[0] = - v_[0];
-      v_[1] = - v_[1];
-      v_[2] = - v_[2];
-      return *this;
-    }
-
-    /** \brief Return a negated vector */
-    inline Vec3f operator - () const
-    {
-      return Vec3f(-v_[0], -v_[1], -v_[2]);
-    }
-
-    /** \brief Return a summation vector */
-    inline Vec3f operator + (const Vec3f& other) const
-    {
-      return Vec3f(v_[0] + other.v_[0], v_[1] + other.v_[1], v_[2] + other.v_[2]);
-    }
-
-    /** \brief Return a substraction vector */
-    inline Vec3f operator - (const Vec3f& other) const
-    {
-      return Vec3f(v_[0] - other.v_[0], v_[1] - other.v_[1], v_[2] - other.v_[2]);
-    }
-
-    /** \brief Scale the vector by t */
-    inline Vec3f operator * (BVH_REAL t) const
-    {
-      return Vec3f(v_[0] * t, v_[1] * t, v_[2] * t);
-    }
-
-    /** \brief Return the cross product with another vector */
-    inline Vec3f cross(const Vec3f& other) const
-    {
-      return Vec3f(v_[1] * other.v_[2] - v_[2] * other.v_[1],
-                    v_[2] * other.v_[0] - v_[0] * other.v_[2],
-                    v_[0] * other.v_[1] - v_[1] * other.v_[0]);
-    }
-
-    /** \brief Return the dot product with another vector */
-    inline BVH_REAL dot(const Vec3f& other) const
-    {
-      return v_[0] * other.v_[0] + v_[1] * other.v_[1] + v_[2] * other.v_[2];
-    }
-
-    /** \brief Normalization */
-    inline bool normalize()
-    {
-      BVH_REAL sqr_length = v_[0] * v_[0] + v_[1] * v_[1] + v_[2] * v_[2];
-      if(sqr_length > EPSILON * EPSILON)
-      {
-        BVH_REAL inv_length = (BVH_REAL)1.0 / (BVH_REAL)sqrt(sqr_length);
-        v_[0] *= inv_length;
-        v_[1] *= inv_length;
-        v_[2] *= inv_length;
-        return true;
-      }
-      return false;
-    }
-
-    inline Vec3f normalized() const
-    {
-      BVH_REAL sqr_length = v_[0] * v_[0] + v_[1] * v_[1] + v_[2] * v_[2];
-      if(sqr_length > EPSILON * EPSILON)
-      {
-        BVH_REAL inv_length = (BVH_REAL)1.0 / (BVH_REAL)sqrt(sqr_length);
-        return *this * inv_length;
-      }
-      else
-      {
-        return *this;
-      }
-    }
-
-
-    /** \brief Return vector length */
-    inline BVH_REAL length() const
-    {
-      return sqrt(v_[0] * v_[0] + v_[1] * v_[1] + v_[2] * v_[2]);
-    }
-
-    /** \brief Return vector square length */
-    inline BVH_REAL sqrLength() const
-    {
-      return v_[0] * v_[0] + v_[1] * v_[1] + v_[2] * v_[2];
-    }
-
-    /** \brief Set the vector using new values */
-    inline void setValue(BVH_REAL x, BVH_REAL y, BVH_REAL z)
-    {
-      v_[0] = x; v_[1] = y; v_[2] = z;
-    }
+  }
 
-    /** \brief Set the vector using new values */
-    inline void setValue(BVH_REAL x)
-    {
-      v_[0] = x; v_[1] = x; v_[2] = x;
-    }
+  inline U length() const { return sqrt(details::dot_prod(data, data)); }
+  inline U sqrLength() const { return details::dot_prod(data, data); }
+  inline void setValue(U x, U y, U z) { data.setValue(x, y, z); }
+  inline void setValue(U x) { data.setValue(x); }
+  inline bool equal(const Vec3fX& other, U epsilon = std::numeric_limits<U>::epsilon() * 100) const { return details::equal(data, other.data, epsilon); }
+  inline void negate() { data.negate(); }
+};
 
-    /** \brief Check whether two vectors are the same in value */
-    inline bool equal(const Vec3f& other) const
-    {
-      return ((v_[0] - other.v_[0] < EPSILON) &&
-             (v_[0] - other.v_[0] > -EPSILON) &&
-             (v_[1] - other.v_[1] < EPSILON) &&
-             (v_[1] - other.v_[1] > -EPSILON) &&
-             (v_[2] - other.v_[2] < EPSILON) &&
-             (v_[2] - other.v_[2] > -EPSILON));
-    }
+template <typename T>
+static inline typename T::meta_type triple(const Vec3fX<T>& x, const Vec3fX<T>& y, const Vec3fX<T>& z)
+{
+  return x.dot(y.cross(z));
+}
 
-    inline BVH_REAL triple(const Vec3f& v1, const Vec3f& v2) const
-    {
-      return v_[0] * (v1.v_[1] * v2.v_[2] - v1.v_[2] * v2.v_[1]) +
-          v_[1] * (v1.v_[2] * v2.v_[0] - v1.v_[0] * v2.v_[2]) +
-          v_[2] * (v1.v_[0] * v2.v_[1] - v1.v_[1] * v2.v_[0]);
-    }
+template <typename T>
+std::ostream& operator << (std::ostream& out, const Vec3fX<T>& x)
+{
+  out << x[0] << " " << x[1] << " " << x[2];
+  return out;
+}
 
-  private:
-    /** \brief Tolerance for comparision */
-    static const BVH_REAL EPSILON;
-  };
+template <typename T>
+static inline Vec3fX<T> min(const Vec3fX<T>& x, const Vec3fX<T>& y)
+{
+  return Vec3fX<T>(details::min(x.data, y.data));
+}
 
+template <typename T>
+static inline Vec3fX<T> max(const Vec3fX<T>& x, const Vec3fX<T>& y)
+{
+  return Vec3fX<T>(details::max(x.data, y.data));
+}
 
-  /** \brief The minimum of two vectors */
-  inline Vec3f min(const Vec3f& a, const Vec3f& b)
-  {
-    Vec3f ret(std::min(a[0], b[0]), std::min(a[1], b[1]), std::min(a[2], b[2]));
-    return ret;
-  }
+template <typename T>
+static inline Vec3fX<T> abs(const Vec3fX<T>& x)
+{
+  return Vec3fX<T>(details::abs(x.data));
+}
 
-  /** \brief the maximum of two vectors */
-  inline Vec3f max(const Vec3f& a, const Vec3f& b)
+template <typename T>
+void generateCoordinateSystem(const Vec3fX<T>& w, Vec3fX<T>& u, Vec3fX<T>& v)
+{
+  typedef typename T::meta_type U;
+  U inv_length;
+  if(std::abs(w[0]) >= std::abs(w[1]))
   {
-    Vec3f ret(std::max(a[0], b[0]), std::max(a[1], b[1]), std::max(a[2], b[2]));
-    return ret;
+    inv_length = (U)1.0 / sqrt(w[0] * w[0] + w[2] * w[2]);
+    u[0] = -w[2] * inv_length;
+    u[1] = (U)0;
+    u[2] = w[0] * inv_length;
+    v[0] = w[1] * u[2];
+    v[1] = w[2] * u[0] - w[0] * u[2];
+    v[2] = -w[1] * u[0];
   }
-
-  inline Vec3f abs(const Vec3f& v)
+  else
   {
-    BVH_REAL x = v[0] < 0 ? -v[0] : v[0];
-    BVH_REAL y = v[1] < 0 ? -v[1] : v[1];
-    BVH_REAL z = v[2] < 0 ? -v[2] : v[2];
-
-    return Vec3f(x, y, z);
+    inv_length = (U)1.0 / sqrt(w[1] * w[1] + w[2] * w[2]);
+    u[0] = (U)0;
+    u[1] = w[2] * inv_length;
+    u[2] = -w[1] * inv_length;
+    v[0] = w[1] * u[2] - w[2] * u[1];
+    v[1] = -w[0] * u[2];
+    v[2] = w[0] * u[1];
   }
+}
 
-#endif
-
-  inline BVH_REAL triple(const Vec3f& a, const Vec3f& b, const Vec3f& c)
-  {
-    return a.triple(b, c);
-  }
 
-  /** \brief generate a coordinate given a vector (i.e., generating three orthonormal vectors given a vector)
-   * w should be normalized
-   */
-  void generateCoordinateSystem(const Vec3f& w, Vec3f& u, Vec3f& v);
+typedef Vec3fX<details::Vec3Data<BVH_REAL> > Vec3f;
 
 
 }
 
+
 #endif