Commit 954b07ab authored by Ian Goodfellow

merged

@@ -91,38 +91,48 @@ if __name__ == "__main__":
 Cpu tested: Xeon E5345(2.33Ghz, 8M L2 cache, 1333Mhz FSB), Xeon E5430(2.66Ghz, 12M L2 cache, 1333Mhz FSB),
 Xeon E5450(3Ghz, 12M L2 cache, 1333Mhz FSB), Xeon X5560(2.8Ghz, 12M L2 cache, 6.4GT/s QPI, hyper-threads enabled?)
 Core 2 E8500, Core i7 930(2.8Ghz, hyper-threads enabled), Core i7 950(3.07GHz, hyper-threads enabled)
+Xeon X5550(2.67GHz, 8M l2 cache?, hyper-threads enabled)
 Lib tested:
 * numpy with ATLAS from distribution(FC9) package (1 thread)
 * manually compiled numpy and ATLAS with 2 threads
 * goto 1.26 with 1, 2, 4 and 8 threads.
+* goto2 1.13 compiled with multiple thread enabled.
-                  Xeon  Xeon  Xeon  Core2  i7    i7     Xeon
-lib/nb threads    E5345 E5430 E5450 E8500  930   950    X5560
+                  Xeon  Xeon  Xeon  Core2  i7    i7     Xeon  Xeon
+lib/nb threads    E5345 E5430 E5450 E8500  930   950    X5560 X5550
+numpy 1.3.0 blas  775.92s
 numpy_FC9_atlas/1 39.2s 35.0s 30.7s 29.6s  21.5s 19.60s
 goto/1            18.7s 16.1s 14.2s 13.7s  16.1s 14.67s
 numpy_MAN_atlas/2 12.0s 11.6s 10.2s  9.2s   9.0s
 goto/2             9.5s  8.1s  7.1s  7.3s   8.1s  7.4s
 goto/4             4.9s  4.4s  3.7s  -      4.1s  3.8s
 goto/8             2.7s  2.4s  2.0s  -      4.1s  3.8s
 openblas/1        14.04s
 openblas/2         7.16s
 openblas/4         3.71s
 openblas/8         3.70s
+mkl 11.0.083/1     7.97s
 mkl 10.2.2.025/1  13.7s
 mkl 10.2.2.025/2   7.6s
 mkl 10.2.2.025/4   4.0s
 mkl 10.2.2.025/8   2.0s
-mkl 11.0.083/1     7.97s
+goto2 1.13/1      14.37s
+goto2 1.13/2       7.26s
+goto2 1.13/4       3.70s
+goto2 1.13/8       1.94s
+goto2 1.13/16      3.16s

 Test time in float32 with cuda 3.0.14
 (cuda version 3.2RC and up are supposed to have faster gemm on the GTX4?? card)
-cpu/cuda version
+gpu/cuda version
 GTX580/3.2   0.20s
 GTX480/3.2   0.24s
 GTX480/3.0   0.27s
 GTX470/3.2   0.29s
+M2070/3.2    0.32s
 GTX470/3.0   0.34s
 GTX285/3.0   0.40s
 GT220/3.2RC  5.15s
...
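The timings above come from a gemm benchmark (the hunk header shows it lives under `if __name__ == "__main__":` in a Python script). A minimal sketch of that kind of measurement is below; the function name, matrix size, and iteration count are illustrative assumptions, not the actual parameters of the benchmark being diffed:

```python
import time
import numpy as np

def time_gemm(n=512, iters=10, dtype=np.float32):
    """Time `iters` square matrix multiplies of size n x n.

    Which BLAS actually runs (ATLAS, GotoBLAS, OpenBLAS, MKL, ...)
    depends on what numpy was linked against, which is exactly the
    variable the table above is comparing.
    """
    a = np.random.rand(n, n).astype(dtype)
    b = np.random.rand(n, n).astype(dtype)
    np.dot(a, b)             # warm-up call so one-time setup is not timed
    t0 = time.time()
    for _ in range(iters):
        np.dot(a, b)
    return time.time() - t0

if __name__ == "__main__":
    print("%.3fs elapsed" % time_gemm())
```

Thread count (the `/1`, `/2`, `/4`, `/8` suffixes in the table) is controlled by the BLAS library's environment variables, e.g. `OMP_NUM_THREADS` or `GOTO_NUM_THREADS`, not by the script itself.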
@@ -636,7 +636,9 @@ PyObject * CudaNdarray_Reshape(CudaNdarray * self, PyObject * shape)
     }
     if (rval_size==0)
     {
-        return CudaNdarray_NewDims(rval_nd, rval_dims);
+        PyObject * rval = CudaNdarray_NewDims(rval_nd, rval_dims);
+        free(rval_dims);
+        return rval;
     }
     if(CudaNdarray_is_c_contiguous(self))
...