Merge pull request #2386 from abergeron/multi_fixes2

Multi-GPU scan fixes

Merge pull request #2386 from abergeron/multi_fixes2
65ac8e8a · Pascal Lamblin · f0ea3819 · d293b5e9 · 65ac8e8a · 65ac8e8a
--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -2000,12 +2000,6 @@ def local_gpu_extract_diagonal(node):
                gpu_from_host(diag_node.inputs[0]))]
    return False

-def typeConstructor(broadcastable, dtype):
-    if dtype == 'float32':
-        return CudaNdarrayType(broadcastable=broadcastable)
-    else:
-        return tensor.TensorType(broadcastable=broadcastable, dtype=dtype)
-
 @register_opt('scan')
 @local_optimizer([gpu_from_host, scan_op.Scan])
 def gpuScanOptimization(node):
@@ -2065,9 +2059,7 @@ def gpuScanOptimization(node):

            nw_op = scan_op.Scan(scan_ins,
                                 scan_outs,
-                                 info,
-                                 typeConstructor=typeConstructor).make_node(
-                                     *nw_ins)
+                                 info).make_node(*nw_ins)
            _outputs = nw_op.outputs
            return _outputs

@@ -2113,8 +2105,7 @@ def gpuScanOptimization(node):
            _outputs = scan_op.Scan(
                scan_ins,
                scan_outs,
-                info,
-                typeConstructor=typeConstructor).make_node(*nw_ins).outputs
+                info).make_node(*nw_ins).outputs
            outputs = []
            for x, y in zip(_outputs, node.outputs):
                if isinstance(y.type, CudaNdarrayType):
@@ -2126,8 +2117,7 @@ def gpuScanOptimization(node):


 optdb.register('gpu_scanOp_make_inplace',
-               scan_opt.ScanInplaceOptimizer(typeConstructor=typeConstructor,
-                                             gpu_flag=True),
+               scan_opt.ScanInplaceOptimizer(gpu_flag=True),
               75,
               'gpu',
               'fast_run',

--- a/theano/sandbox/gpuarray/opt.py
+++ b/theano/sandbox/gpuarray/opt.py
@@ -716,13 +716,11 @@ def local_scan_to_gpua(node):
    _cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, [])
    info['gpu_hash'] = hash(_cmodule_key)

-    nw_op = scan_op.Scan(scan_ins, scan_outs, info,
-                         typeConstructor=GpuArrayType).make_node(*nw_ins)
+    nw_op = scan_op.Scan(scan_ins, scan_outs, info).make_node(*nw_ins)
    return nw_op.outputs

 optdb.register('gpua_scanOp_make_inplace',
-               scan_opt.ScanInplaceOptimizer(typeConstructor=GpuArrayType,
-                                             gpua_flag=True),
+               scan_opt.ScanInplaceOptimizer(gpua_flag=True),
               75,
               'gpua',
               'fast_run',

--- a/theano/sandbox/gpuarray/tests/test_scan.py
+++ b/theano/sandbox/gpuarray/tests/test_scan.py
@@ -15,6 +15,7 @@ from theano.sandbox.gpuarray.tests.test_basic_ops import mode_with_gpu
 class T_Scan(TestCase):
    def setUp(self):
        utt.seed_rng()
+        super(T_Scan, self).setUp()

    def test_one_sequence_one_output_weights_gpu1(self):
        def f_rnn(u_t, x_tm1, W_in, W):

--- a/theano/scan_module/scan.py
+++ b/theano/scan_module/scan.py
@@ -594,7 +594,9 @@ def scan(fn,
        if init_out.get('taps', None) == [-1]:

            actual_arg = init_out['initial']
-            arg = safe_new(init_out['initial'])
+            if not isinstance(actual_arg, tensor.Variable):
+                actual_arg = tensor.as_tensor_variable(actual_arg)
+            arg = safe_new(actual_arg)
            if isinstance(arg, tensor.Constant):
                # safe new returns a clone of the constants, but that is not
                # what we need for initial states

--- a/theano/scan_module/scan_op.py
+++ b/theano/scan_module/scan_op.py
@@ -49,7 +49,6 @@ class Scan(PureOp):
                 inputs,
                 outputs,
                 info,
-                 typeConstructor=None,
                ):
        """
        :param inputs: inputs of the inner function of scan
@@ -58,21 +57,6 @@ class Scan(PureOp):
            the scan op (like number of different types of
            arguments, name, mode, if it should run on GPU or
            not, etc.)
-        :param typeConstructor: function that constructs an equivalent
-            to Theano TensorType
-
-
-        Note: ``typeConstructor`` had been added to refactor how
-        Theano deals with the GPU. If it runs on the GPU, scan needs
-        to construct certain outputs (those who reside in the GPU
-        memory) as the GPU-specific type.  However we can not import
-        gpu code in this file (as it is in sandbox, and not available
-        on each machine) so the workaround is that the GPU
-        optimization passes to the constructor of this class a
-        function that is able to construct a GPU type. This way the
-        class Scan does not need to be aware of the details for the
-        GPU, it just constructs any tensor using this function (which
-        by default constructs normal tensors).
        """
        if 'gpua' not in info:
            info['gpua'] = False
@@ -88,19 +72,13 @@ class Scan(PureOp):
        self.output_types = []
        idx = 0
        jdx = 0
-        tensorConstructor = lambda broadcastable, dtype: TensorType(
-            broadcastable=broadcastable, dtype=dtype)
-        if typeConstructor is None:
-            typeConstructor = tensorConstructor

        while idx < self.n_mit_mot_outs:
            # Note that for mit_mot there are several output slices per
            # output sequence
            o = outputs[idx]
            self.output_types.append(
-                typeConstructor(
-                    broadcastable=(False,) + o.type.broadcastable,
-                    dtype=o.type.dtype))
+                o.type.clone(broadcastable=(False,) + o.type.broadcastable))

            idx += len(self.mit_mot_out_slices[jdx])
            jdx += 1
@@ -110,9 +88,7 @@ class Scan(PureOp):

        for o in outputs[idx:end]:
            self.output_types.append(
-                typeConstructor(
-                    broadcastable=(False,) + o.type.broadcastable,
-                    dtype=o.type.dtype))
+                o.type.clone(broadcastable=(False,) + o.type.broadcastable))

        # shared outputs + possibly the ending condition
        for o in outputs[end:]:
@@ -241,10 +217,9 @@ class Scan(PureOp):
            if rval.ndim == as_var.ndim:
                rval = as_var.type.filter_variable(rval)
            else:
-                tmp = as_var.type.__class__(
-                    broadcastable=tuple(var.broadcastable[:1])+\
-                                  tuple(as_var.broadcastable),
-                    dtype=as_var.dtype)
+                tmp = as_var.type.clone(
+                    broadcastable=(tuple(var.broadcastable[:1]) +
+                                   tuple(as_var.broadcastable)))
                rval = tmp.filter_variable(rval)
            return rval

@@ -517,11 +492,11 @@ class Scan(PureOp):
        return aux_txt

    def __hash__(self):
-        return (hash(type(self)) ^
-                # and a hash representing the inner graph using the
-                # CLinker.cmodule_key_
-                self._hash_inner_graph ^
-                scan_utils.hash_listsDictsTuples(self.info))
+        return hash((type(self),
+                     # and a hash representing the inner graph using the
+                     # CLinker.cmodule_key_
+                     self._hash_inner_graph,
+                     scan_utils.hash_listsDictsTuples(self.info)))

    def make_thunk(self, node, storage_map, compute_map, no_recycling):
        """

--- a/theano/scan_module/scan_opt.py
+++ b/theano/scan_module/scan_opt.py
@@ -916,9 +916,8 @@ class PushOutScanOutput(gof.Optimizer):

 class ScanInplaceOptimizer(Optimizer):
    """Graph optimizer for Scan (makes it run inplace)"""
-    def __init__(self, typeConstructor=None, gpu_flag=False, gpua_flag=False):
+    def __init__(self, gpu_flag=False, gpua_flag=False):
        Optimizer.__init__(self)
-        self.typeConstructor = typeConstructor
        self.gpu_flag = gpu_flag
        self.gpua_flag = gpua_flag

@@ -960,8 +959,7 @@ class ScanInplaceOptimizer(Optimizer):
                inputs = ls_begin + ls + ls_end
                new_op = scan_op.Scan(op.inputs,
                                      op.outputs,
-                                      info,
-                                      typeConstructor=self.typeConstructor)
+                                      info)

                # Do not call make_node for test_value
                new_outs = new_op(*inputs, **dict(return_list=True))
@@ -2087,8 +2085,7 @@ scan_eqopt2 = theano.gof.EquilibriumDB()
 optdb.register('scan_eqopt1', scan_eqopt1, .1, 'fast_run', 'scan')
 optdb.register('scan_eqopt2', scan_eqopt2, 1.6, 'fast_run', 'scan')
 optdb.register('scanOp_make_inplace',
-               ScanInplaceOptimizer(typeConstructor=None,
-                                    gpu_flag=False),
+               ScanInplaceOptimizer(),
               75,
               'fast_run',
               'inplace',

--- a/theano/scan_module/scan_perform.c
+++ b/theano/scan_module/scan_perform.c
--- a/theano/scan_module/scan_perform.pyx
+++ b/theano/scan_module/scan_perform.pyx
@@ -62,7 +62,7 @@ import copy


 def get_version():
-    return 0.283
+    return 0.284

 @cython.boundscheck(False)
 def perform(

--- a/theano/scan_module/scan_perform_ext.py
+++ b/theano/scan_module/scan_perform_ext.py
@@ -16,7 +16,7 @@ from theano.gof import cmodule
 _logger = logging.getLogger('theano.scan_module.scan_perform')


-version = 0.283  # must match constant returned in function get_version()
+version = 0.284  # must match constant returned in function get_version()

 need_reload = False


--- a/theano/scan_module/scan_utils.py
+++ b/theano/scan_module/scan_utils.py
@@ -46,6 +46,7 @@ def safe_new(x, tag='', dtype=None):
        nw_name = x.name + tag
    else:
        nw_name = None
+
    if isinstance(x, theano.Constant):
        if dtype and x.dtype != dtype:
            casted_x = x.astype(dtype)
@@ -54,28 +55,14 @@ def safe_new(x, tag='', dtype=None):
            return nwx
        else:
            return x.clone()
-    # Note, as_tensor_variable will convert the Scalar into a
-    # TensorScalar that will require a ScalarFromTensor op,
-    # making the pushout optimization fail
-    elif isinstance(x, scalar.ScalarVariable):
-        if dtype:
-            nw_x = scalar.get_scalar_type(dtype=dtype)()
-        else:
-            nw_x = x.type()
-        nw_x.name = nw_name
-        return nw_x
-    else:
-        try:
-            x = tensor.as_tensor_variable(x)
-        except TypeError:
-            # This could happen for example for random states, and I really
-            # want to avoid the convoluted logic that checks for cuda
-            # ndarrays
-            pass
+
+    # at this point we should only have Variables
+    assert isinstance(x, theano.Variable)
    nw_x = x.type()
    if dtype and nw_x.dtype != dtype:
        nw_x = nw_x.astype(dtype).type()
    nw_x.name = nw_name
+
    # Preserve test values so that the 'compute_test_value' option can be used.
    # The test value is deep-copied to ensure there can be no interactions
    # between test values, due to inplace operations for instance. This may
@@ -815,7 +802,7 @@ class scan_args(object):
    def __init__(self, outer_inputs, outer_outputs,
                 _inner_inputs, _inner_outputs, info):
        self.n_steps = outer_inputs[0]
-        rval = reconstruct_graph(_inner_inputs, _inner_outputs, '_merge')
+        rval = reconstruct_graph(_inner_inputs, _inner_outputs, '')
        if info['as_while']:
            self.cond = [rval[1][-1]]
            inner_outputs = rval[1][:-1]
@@ -919,6 +906,9 @@ class scan_args(object):
        p += n_shared_outs
        q += n_shared_outs

+        assert p == len(outer_outputs)
+        assert q == len(inner_outputs)
+
        self.other_info = OrderedDict()
        for k in ('truncate_gradient', 'name', 'mode', 'destroy_map',
                  'gpu', 'gpua', 'as_while', 'profile', 'allow_gc'):