HaveF · May 14, 2013 12:29
diff --git a/check_blas.py b/check_blas.py
 D:\OpenCourses\ufldl\ref\Theano\theano\misc>python check_blas.py
 Forcing DISTUTILS_USE_SDK=1
 WARNING (theano.tensor.blas): Failed to import scipy.linalg.blas. Falling back on slower implementations (DLL load failed: 找不到指定的模块。)

        Some results that you can compare against. They were 10 executions
        of gemm in float64 with matrices of shape 2000x2000 (M=N=K=2000).
        All memory layout was in C order.

        CPU tested: Xeon E5345(2.33Ghz, 8M L2 cache, 1333Mhz FSB),
                    Xeon E5430(2.66Ghz, 12M L2 cache, 1333Mhz FSB),
                    Xeon E5450(3Ghz, 12M L2 cache, 1333Mhz FSB),
                    Xeon X5560(2.8Ghz, 12M L2 cache, hyper-threads?)
                    Core 2 E8500, Core i7 930(2.8Ghz, hyper-threads enabled),
                    Core i7 950(3.07GHz, hyper-threads enabled)
                    Xeon X5550(2.67GHz, 8M l2 cache?, hyper-threads enabled)


        Libraries tested:
            * numpy with ATLAS from distribution (FC9) package (1 thread)
            * manually compiled numpy and ATLAS with 2 threads
            * goto 1.26 with 1, 2, 4 and 8 threads
            * goto2 1.13 compiled with multiple threads enabled

                          Xeon   Xeon   Xeon  Core2 i7    i7     Xeon   Xeon
        lib/nb threads    E5345  E5430  E5450 E8500 930   950    X5560  X5550

        numpy 1.3.0 blas                                                775.92s
        numpy_FC9_atlas/1 39.2s  35.0s  30.7s 29.6s 21.5s 19.60s
        goto/1            18.7s  16.1s  14.2s 13.7s 16.1s 14.67s
        numpy_MAN_atlas/2 12.0s  11.6s  10.2s  9.2s  9.0s
        goto/2             9.5s   8.1s   7.1s  7.3s  8.1s  7.4s
        goto/4             4.9s   4.4s   3.7s  -     4.1s  3.8s
        goto/8             2.7s   2.4s   2.0s  -     4.1s  3.8s
        openblas/1                                        14.04s
        openblas/2                                         7.16s
        openblas/4                                         3.71s
        openblas/8                                         3.70s
        mkl 11.0.083/1            7.97s
        mkl 10.2.2.025/1                                         13.7s
        mkl 10.2.2.025/2                                          7.6s
        mkl 10.2.2.025/4                                          4.0s
        mkl 10.2.2.025/8                                          2.0s
        goto2 1.13/1                                                     14.37s
        goto2 1.13/2                                                      7.26s
        goto2 1.13/4                                                      3.70s
        goto2 1.13/8                                                      1.94s
        goto2 1.13/16                                                     3.16s

        Test time in float32

        cuda version      5.0    4.2    4.1    4.0    3.2    3.0   # note
        gpu
        K20m/ECC          0.07s
        K20/NOECC         0.07s
        M2070             0.25s         0.27s         0.32s
        M2050(Amazon)     0.25s
        C2075                    0.25s
        C1060                                         0.46s

        GTX Titan(D15U-50)0.06s  don't work
        GTX 680           0.12s  0.154s               0.218s
        GTX 580           0.16s  0.164s               0.203s
        GTX 480           0.19s  0.192s               0.237s 0.27s
        GTX 470           0.23s  0.238s               0.297s 0.34s
        GTX 660           0.20s  0.23s
        GTX 560                  0.30s
        GTX 650 Ti        0.27s
        GTX 460           0.37s                0.45s
        GTX 285                  0.452s        0.452s        0.40s # cuda 3.0 seems faster? driver version?
        GTX 550 Ti                             0.57s
        GT 520                   2.68s                3.06s
        520M              2.44s                       3.19s        # with bumblebee on Ubuntu 12.04
        GT 220                                        3.80s
        GT 210                                 6.35s
        8500 GT                                              10.68s

 Some Theano flags:
    blas.ldflags= -LC:\Anaconda\MinGW\lib -lopenblas
    compiledir= R:\compiledir_Windows-7-6.1.7601-SP1-Intel64_Family_6_Model_58_Stepping_9_GenuineIntel-2.7.3-64
    floatX= float64
    device= cpu
 Some environment variables:
    MKL_NUM_THREADS= None
    OMP_NUM_THREADS= None
    GOTO_NUM_THREADS= None

 Numpy config: (used when the Theano flag "blas.ldflags" is empty)
 lapack_opt_info:
    libraries = ['mkl_lapack95_lp64', 'mkl_core_dll', 'mkl_intel_lp64_dll', 'mkl_intel_thread_dll']
    library_dirs = ['C:\\aroot\\stage\\libs']
    define_macros = [('SCIPY_MKL_H', None)]
    include_dirs = ['C:\\aroot\\stage\\include']
 blas_opt_info:
    libraries = ['mkl_core_dll', 'mkl_intel_lp64_dll', 'mkl_intel_thread_dll']
    library_dirs = ['C:\\aroot\\stage\\libs']
    define_macros = [('SCIPY_MKL_H', None)]
    include_dirs = ['C:\\aroot\\stage\\include']
 lapack_mkl_info:
    libraries = ['mkl_lapack95_lp64', 'mkl_core_dll', 'mkl_intel_lp64_dll', 'mkl
 _intel_thread_dll']
    library_dirs = ['C:\\aroot\\stage\\libs']
    define_macros = [('SCIPY_MKL_H', None)]
    include_dirs = ['C:\\aroot\\stage\\include']
 blas_mkl_info:
    libraries = ['mkl_core_dll', 'mkl_intel_lp64_dll', 'mkl_intel_thread_dll']
    library_dirs = ['C:\\aroot\\stage\\libs']
    define_macros = [('SCIPY_MKL_H', None)]
    include_dirs = ['C:\\aroot\\stage\\include']
 mkl_info:
    libraries = ['mkl_core_dll', 'mkl_intel_lp64_dll', 'mkl_intel_thread_dll']
    library_dirs = ['C:\\aroot\\stage\\libs']
    define_macros = [('SCIPY_MKL_H', None)]
    include_dirs = ['C:\\aroot\\stage\\include']
 Numpy dot module: numpy.core._dotblas
 Numpy location: C:\Anaconda\lib\site-packages\numpy\__init__.pyc
 Numpy version: 1.7.1
 Traceback (most recent call last):
  File "check_blas.py", line 229, in <module>
    iters=options.iter, order=options.order)
  File "check_blas.py", line 71, in execute
    f = theano.function([], updates=[(c, 0.4 * c + .8 * T.dot(a, b))])
  File "D:\OpenCourses\ufldl\ref\Theano\theano\compile\function.py", line 222, i
 n function
    profile=profile)
  File "D:\OpenCourses\ufldl\ref\Theano\theano\compile\pfunc.py", line 506, in p
 func
    on_unused_input=on_unused_input)
  File "D:\OpenCourses\ufldl\ref\Theano\theano\compile\function_module.py", line
 1299, in orig_function
    defaults)
  File "D:\OpenCourses\ufldl\ref\Theano\theano\compile\function_module.py", line
 1168, in create
    _fn, _i, _o = self.linker.make_thunk(input_storage=input_storage_lists)
  File "D:\OpenCourses\ufldl\ref\Theano\theano\gof\link.py", line 382, in make_t
 hunk
    output_storage = output_storage)[:3]
  File "D:\OpenCourses\ufldl\ref\Theano\theano\gof\vm.py", line 840, in make_all

    for node in order]
  File "D:\OpenCourses\ufldl\ref\Theano\theano\gof\op.py", line 589, in make_thu
 nk
    output_storage=node_output_storage)
  File "D:\OpenCourses\ufldl\ref\Theano\theano\gof\cc.py", line 933, in make_thu
 nk
    keep_lock=keep_lock)
  File "D:\OpenCourses\ufldl\ref\Theano\theano\gof\cc.py", line 876, in __compil
 e__
    keep_lock=keep_lock)
  File "D:\OpenCourses\ufldl\ref\Theano\theano\gof\cc.py", line 1304, in cthunk_
 factory
    key=key, fn=self.compile_cmodule_by_step, keep_lock=keep_lock)
  File "D:\OpenCourses\ufldl\ref\Theano\theano\gof\cmodule.py", line 992, in mod
 ule_from_key
    module = next(compile_steps)
  File "D:\OpenCourses\ufldl\ref\Theano\theano\gof\cc.py", line 1221, in compile
 _cmodule_by_step
    preargs=preargs)
  File "D:\OpenCourses\ufldl\ref\Theano\theano\gof\cmodule.py", line 1841, in co
 mpile_str
    return dlimport(lib_filename)
  File "D:\OpenCourses\ufldl\ref\Theano\theano\gof\cmodule.py", line 280, in dli
 mport
    rval = __import__(module_name, {}, {}, [module_name])
 ImportError: ('DLL load failed: \xd5\xd2\xb2\xbb\xb5\xbd\xd6\xb8\xb6\xa8\xb5\xc4\xc4\xa3\xbf\xe9\xa1\xa3', '[Gemm{inplace}(<TensorType(float64, matrix)>, TensorConstant{0.8}, <TensorType(float64, matrix)>, <TensorType(float64, matrix)>, TensorConstant{0.4})]')
	D:\OpenCourses\ufldl\ref\Theano\theano\misc>python check_blas.py
	Forcing DISTUTILS_USE_SDK=1
	WARNING (theano.tensor.blas): Failed to import scipy.linalg.blas. Falling back on slower implementations (DLL load failed: 找不到指定的模块。)

	Some results that you can compare against. They were 10 executions
	of gemm in float64 with matrices of shape 2000x2000 (M=N=K=2000).
	All memory layout was in C order.

	CPU tested: Xeon E5345(2.33Ghz, 8M L2 cache, 1333Mhz FSB),
	Xeon E5430(2.66Ghz, 12M L2 cache, 1333Mhz FSB),
	Xeon E5450(3Ghz, 12M L2 cache, 1333Mhz FSB),
	Xeon X5560(2.8Ghz, 12M L2 cache, hyper-threads?)
	Core 2 E8500, Core i7 930(2.8Ghz, hyper-threads enabled),
	Core i7 950(3.07GHz, hyper-threads enabled)
	Xeon X5550(2.67GHz, 8M l2 cache?, hyper-threads enabled)


	Libraries tested:
	* numpy with ATLAS from distribution (FC9) package (1 thread)
	* manually compiled numpy and ATLAS with 2 threads
	* goto 1.26 with 1, 2, 4 and 8 threads
	* goto2 1.13 compiled with multiple threads enabled

	Xeon Xeon Xeon Core2 i7 i7 Xeon Xeon
	lib/nb threads E5345 E5430 E5450 E8500 930 950 X5560 X5550

	numpy 1.3.0 blas 775.92s
	numpy_FC9_atlas/1 39.2s 35.0s 30.7s 29.6s 21.5s 19.60s
	goto/1 18.7s 16.1s 14.2s 13.7s 16.1s 14.67s
	numpy_MAN_atlas/2 12.0s 11.6s 10.2s 9.2s 9.0s
	goto/2 9.5s 8.1s 7.1s 7.3s 8.1s 7.4s
	goto/4 4.9s 4.4s 3.7s - 4.1s 3.8s
	goto/8 2.7s 2.4s 2.0s - 4.1s 3.8s
	openblas/1 14.04s
	openblas/2 7.16s
	openblas/4 3.71s
	openblas/8 3.70s
	mkl 11.0.083/1 7.97s
	mkl 10.2.2.025/1 13.7s
	mkl 10.2.2.025/2 7.6s
	mkl 10.2.2.025/4 4.0s
	mkl 10.2.2.025/8 2.0s
	goto2 1.13/1 14.37s
	goto2 1.13/2 7.26s
	goto2 1.13/4 3.70s
	goto2 1.13/8 1.94s
	goto2 1.13/16 3.16s

	Test time in float32

	cuda version 5.0 4.2 4.1 4.0 3.2 3.0 # note
	gpu
	K20m/ECC 0.07s
	K20/NOECC 0.07s
	M2070 0.25s 0.27s 0.32s
	M2050(Amazon) 0.25s
	C2075 0.25s
	C1060 0.46s

	GTX Titan(D15U-50)0.06s don't work
	GTX 680 0.12s 0.154s 0.218s
	GTX 580 0.16s 0.164s 0.203s
	GTX 480 0.19s 0.192s 0.237s 0.27s
	GTX 470 0.23s 0.238s 0.297s 0.34s
	GTX 660 0.20s 0.23s
	GTX 560 0.30s
	GTX 650 Ti 0.27s
	GTX 460 0.37s 0.45s
	GTX 285 0.452s 0.452s 0.40s # cuda 3.0 seems faster? driver version?
	GTX 550 Ti 0.57s
	GT 520 2.68s 3.06s
	520M 2.44s 3.19s # with bumblebee on Ubuntu 12.04
	GT 220 3.80s
	GT 210 6.35s
	8500 GT 10.68s

	Some Theano flags:
	blas.ldflags= -LC:\Anaconda\MinGW\lib -lopenblas
	compiledir= R:\compiledir_Windows-7-6.1.7601-SP1-Intel64_Family_6_Model_58_S
	tepping_9_GenuineIntel-2.7.3-64
	floatX= float64
	device= cpu
	Some environment variables:
	MKL_NUM_THREADS= None
	OMP_NUM_THREADS= None
	GOTO_NUM_THREADS= None

	Numpy config: (used when the Theano flag "blas.ldflags" is empty)
	lapack_opt_info:
	libraries = ['mkl_lapack95_lp64', 'mkl_core_dll', 'mkl_intel_lp64_dll', 'mkl
	_intel_thread_dll']
	library_dirs = ['C:\\aroot\\stage\\libs']
	define_macros = [('SCIPY_MKL_H', None)]
	include_dirs = ['C:\\aroot\\stage\\include']
	blas_opt_info:
	libraries = ['mkl_core_dll', 'mkl_intel_lp64_dll', 'mkl_intel_thread_dll']
	library_dirs = ['C:\\aroot\\stage\\libs']
	define_macros = [('SCIPY_MKL_H', None)]
	include_dirs = ['C:\\aroot\\stage\\include']
	lapack_mkl_info:
	libraries = ['mkl_lapack95_lp64', 'mkl_core_dll', 'mkl_intel_lp64_dll', 'mkl
	_intel_thread_dll']
	library_dirs = ['C:\\aroot\\stage\\libs']
	define_macros = [('SCIPY_MKL_H', None)]
	include_dirs = ['C:\\aroot\\stage\\include']
	blas_mkl_info:
	libraries = ['mkl_core_dll', 'mkl_intel_lp64_dll', 'mkl_intel_thread_dll']
	library_dirs = ['C:\\aroot\\stage\\libs']
	define_macros = [('SCIPY_MKL_H', None)]
	include_dirs = ['C:\\aroot\\stage\\include']
	mkl_info:
	libraries = ['mkl_core_dll', 'mkl_intel_lp64_dll', 'mkl_intel_thread_dll']
	library_dirs = ['C:\\aroot\\stage\\libs']
	define_macros = [('SCIPY_MKL_H', None)]
	include_dirs = ['C:\\aroot\\stage\\include']
	Numpy dot module: numpy.core._dotblas
	Numpy location: C:\Anaconda\lib\site-packages\numpy\__init__.pyc
	Numpy version: 1.7.1
	Traceback (most recent call last):
	File "check_blas.py", line 229, in <module>
	iters=options.iter, order=options.order)
	File "check_blas.py", line 71, in execute
	f = theano.function([], updates=[(c, 0.4 * c + .8 * T.dot(a, b))])
	File "D:\OpenCourses\ufldl\ref\Theano\theano\compile\function.py", line 222, i
	n function
	profile=profile)
	File "D:\OpenCourses\ufldl\ref\Theano\theano\compile\pfunc.py", line 506, in p
	func
	on_unused_input=on_unused_input)
	File "D:\OpenCourses\ufldl\ref\Theano\theano\compile\function_module.py", line
	1299, in orig_function
	defaults)
	File "D:\OpenCourses\ufldl\ref\Theano\theano\compile\function_module.py", line
	1168, in create
	_fn, _i, _o = self.linker.make_thunk(input_storage=input_storage_lists)
	File "D:\OpenCourses\ufldl\ref\Theano\theano\gof\link.py", line 382, in make_t
	hunk
	output_storage = output_storage)[:3]
	File "D:\OpenCourses\ufldl\ref\Theano\theano\gof\vm.py", line 840, in make_all

	for node in order]
	File "D:\OpenCourses\ufldl\ref\Theano\theano\gof\op.py", line 589, in make_thu
	nk
	output_storage=node_output_storage)
	File "D:\OpenCourses\ufldl\ref\Theano\theano\gof\cc.py", line 933, in make_thu
	nk
	keep_lock=keep_lock)
	File "D:\OpenCourses\ufldl\ref\Theano\theano\gof\cc.py", line 876, in __compil
	e__
	keep_lock=keep_lock)
	File "D:\OpenCourses\ufldl\ref\Theano\theano\gof\cc.py", line 1304, in cthunk_
	factory
	key=key, fn=self.compile_cmodule_by_step, keep_lock=keep_lock)
	File "D:\OpenCourses\ufldl\ref\Theano\theano\gof\cmodule.py", line 992, in mod
	ule_from_key
	module = next(compile_steps)
	File "D:\OpenCourses\ufldl\ref\Theano\theano\gof\cc.py", line 1221, in compile
	_cmodule_by_step
	preargs=preargs)
	File "D:\OpenCourses\ufldl\ref\Theano\theano\gof\cmodule.py", line 1841, in co
	mpile_str
	return dlimport(lib_filename)
	File "D:\OpenCourses\ufldl\ref\Theano\theano\gof\cmodule.py", line 280, in dli
	mport
	rval = __import__(module_name, {}, {}, [module_name])
	ImportError: ('DLL load failed: \xd5\xd2\xb2\xbb\xb5\xbd\xd6\xb8\xb6\xa8\xb5\xc4
	\xc4\xa3\xbf\xe9\xa1\xa3', '[Gemm{inplace}(<TensorType(float64, matrix)>, Tensor
	Constant{0.8}, <TensorType(float64, matrix)>, <TensorType(float64, matrix)>, Ten
	sorConstant{0.4})]')