提交 c058326d authored 作者: Brandon T. Willard's avatar Brandon T. Willard 提交者: Brandon T. Willard

Replace use of T with aet

上级 4a8ccb6d
......@@ -25,9 +25,10 @@ class Assert(COp):
--------
>>> import aesara
>>> import aesara.tensor as aet
>>> x = aet.vector('x')
>>> assert_op = aet.opt.Assert()
>>> func = aesara.function([x], assert_op(x, x.size<2))
>>> from aesara.assert_op import Assert
>>> x = aet.vector("x")
>>> assert_op = Assert("This assert failed")
>>> func = aesara.function([x], assert_op(x, x.size < 2))
"""
......
......@@ -3379,10 +3379,10 @@ def dnn_batch_normalization_train(
axes = 0 if mode == 'per-activation' else (0, 2, 3)
mean = inputs.mean(axes, keepdims=True)
var = inputs.var(axes, keepdims=True)
invstd = T.inv(T.sqrt(var + epsilon))
invstd = aet.inv(aet.sqrt(var + epsilon))
out = (inputs - mean) * gamma * invstd + beta
m = T.cast(T.prod(inputs.shape) / T.prod(mean.shape), 'float32')
m = aet.cast(aet.prod(inputs.shape) / aet.prod(mean.shape), 'float32')
running_mean = running_mean * (1 - running_average_factor) + \\
mean * running_average_factor
running_var = running_var * (1 - running_average_factor) + \\
......@@ -3511,9 +3511,9 @@ def dnn_batch_normalization_test(
.. code-block:: python
axes = (0,) if mode == 'per-activation' else (0, 2, 3)
gamma, beta, mean, var = (T.addbroadcast(t, *axes)
gamma, beta, mean, var = (aet.addbroadcast(t, *axes)
for t in (gamma, beta, mean, var))
out = (inputs - mean) * gamma / T.sqrt(var + epsilon) + beta
out = (inputs - mean) * gamma / aet.sqrt(var + epsilon) + beta
For 5d tensors, the axes would be (0, 2, 3, 4).
"""
......
......@@ -3420,7 +3420,7 @@ class _nd_grid:
Examples
--------
>>> a = T.mgrid[0:5, 0:3]
>>> a = aet.mgrid[0:5, 0:3]
>>> a[0].eval()
array([[0, 0, 0],
[1, 1, 1],
......@@ -3433,7 +3433,7 @@ class _nd_grid:
[0, 1, 2],
[0, 1, 2],
[0, 1, 2]], dtype=int8)
>>> b = T.ogrid[0:5, 0:3]
>>> b = aet.ogrid[0:5, 0:3]
>>> b[0].eval()
array([[0],
[1],
......@@ -3853,45 +3853,28 @@ def diagonal(a, offset=0, axis1=0, axis2=1):
class AllocDiag(Op):
"""
An op that copies a vector to the diagonal of an empty matrix. It does the
inverse of ExtractDiag.
Usage: T.AllocDiag()(x)
`x` should be a tensor vector. The parenthesis in the front should indicate
which main diagonal the vector value goes into. By default it is set to
`0`, which corresponds to setting the values of x to the main diagonal in
the returned matrix.
Parameters
----------
axis1: Axis to be used as the first axis of the 2-D
sub-arrays to which the diagonals will be allocated.
Defaults to first axis (0).
axis2: Axis to be used as the second axis of the 2-D
sub-arrays to which the diagonals will be allocated.
Defaults to second axis (1).
offset: Offset of the diagonal from the main diagonal defined by `axis1`
and `axis2`.
Can be positive or negative.
Defaults to main diagonal (0).
x: symbolic vector
A tensor vector consists of diagonal values.
Returns
-------
tensor : symbolic tenstor
A tensor with passed tensor values at their corresponding diagonals.
"""An `Op` that copies a vector to the diagonal of an empty matrix.
It does the inverse of `ExtractDiag`.
"""
__props__ = ("offset", "axis1", "axis2")
def __init__(self, offset=0, axis1=0, axis2=1):
"""
Parameters
----------
offset: int
Offset of the diagonal from the main diagonal defined by `axis1`
and `axis2`. Can be positive or negative. Defaults to main
diagonal (i.e. 0).
axis1: int
Axis to be used as the first axis of the 2-D sub-arrays to which
the diagonals will be allocated. Defaults to first axis (i.e. 0).
axis2: int
Axis to be used as the second axis of the 2-D sub-arrays to which
the diagonals will be allocated. Defaults to second axis (i.e. 1).
"""
self.offset = offset
self.axis1 = axis1
self.axis2 = axis2
......
......@@ -810,7 +810,7 @@ class ShapeFeature(toolbox.Feature):
2. to infer the shape of every node in the graph in terms of the
input shapes.
3. remove all fills (T.second, T.fill) from the graph
3. remove all fills ``(aet.second, aet.fill)`` from the graph
Lifting shapes as close to the inputs as possible is important for
canonicalization because it is very bad form to have to compute
......@@ -2236,12 +2236,12 @@ def local_alloc_unary(fgraph, node):
x = a.owner.inputs[0]
shp = a.owner.inputs[1:]
v = node.op(x)
# T.alloc does not preserve the stacktrace of v,
# aet.alloc does not preserve the stacktrace of v,
# so we need to copy it over from x.
copy_stack_trace(node.outputs[0], v)
ret = alloc(cast(v, node.outputs[0].dtype), *shp)
# T.cast does not preserve the stacktrace of x,
# aet.cast does not preserve the stacktrace of x,
# so we need to copy it over to the output.
copy_stack_trace([node.outputs[0], a], ret)
return [ret]
......@@ -3132,14 +3132,11 @@ def local_subtensor_of_alloc(fgraph, node):
@register_specialize
@local_optimizer([Subtensor])
def local_subtensor_of_dot(fgraph, node):
"""
This optimization translates T.dot(A, B)[idxs] into T.dot(A[idxs_a], B[idxs_b]),
where idxs_a and idxs_b are defined appropriately.
"""Rewrite ``aet.dot(A, B)[idxs]`` into ``aet.dot(A[idxs_a], B[idxs_b])``.
idxs_a is the first A.ndim-1 entries of idxs,
and idxs_b is the remaining entries of idxs (if any),
modified to skip the second-to-last dimension of B
(because dot sums over this dimension).
``idxs_a`` is the first ``A.ndim-1`` entries of ``idxs``, and ``idxs_b`` is
the remaining entries of ``idxs`` (if any), modified to skip the
second-to-last dimension of ``B`` (because dot sums over this dimension).
"""
if not isinstance(node.op, Subtensor):
......@@ -3535,7 +3532,7 @@ def local_useless_inc_subtensor_alloc(fgraph, node):
i = node.inputs[2:]
if y.owner is not None and isinstance(y.owner.op, Alloc):
# `z` is the input of the Alloc op, i.e. T.alloc(z, <shape>)
# `z` is the input of the Alloc op, i.e. aet.alloc(z, <shape>)
z = y.owner.inputs[0]
try:
......@@ -3803,7 +3800,7 @@ def local_join_empty(fgraph, node):
new_inputs.append(inp)
if len(new_inputs) < len(node.inputs) - 1:
if len(new_inputs) == 0:
# T.join do not work in that case.
            # aet.join does not work in that case.
# constant folding will take care of this case.
return
ret = join(node.inputs[0], *new_inputs)
......@@ -3880,12 +3877,16 @@ def local_join_make_vector(fgraph, node):
def local_useless_switch(fgraph, node):
"""
This optimization makes the following changes in the graph:
T.switch(cond,left,right) -->
if cond is constant and cond == 0: right
if cond is constant and cond != 0: left
if left is right -> left
T.switch(le(shape_i{id}(X), 0), 0, shape_i{id}(X)) -> shape_i{id}(X)
``aet.switch(cond, left, right)`` ->
``if cond is constant and cond == 0``: right
``if cond is constant and cond != 0``: left
``if left is right`` -> ``left``
and
``aet.switch(le(shape_i{id}(X), 0), 0, shape_i{id}(X))`` -> ``shape_i{id}(X)``
"""
if isinstance(node.op, Elemwise) and isinstance(node.op.scalar_op, aes.Switch):
......
......@@ -1111,7 +1111,7 @@ def res_is_a(fgraph, var, op, maxclients=None):
def _as_scalar(res, dtype=None):
"""Return None or a TensorVariable whose type is in T.float_scalar_types"""
"""Return ``None`` or a `TensorVariable` whose type is in `float_scalar_types`"""
if dtype is None:
dtype = config.floatX
if np.all(res.type.broadcastable):
......
......@@ -2490,14 +2490,14 @@ class Prod(CAReduceDtype):
Implementing that case-by-case logic is not as trivial, so a bunch of
hacks are piled down here to do it. Notably, for the "only one zero"
case, there's a special Op that computes the product of the elements
in the group, minus the zero (see ProdWithoutZero). The trick is then
in the group, minus the zero (see `ProdWithoutZeros`). The trick is then
to use the division trick for groups with no zero, to use the
ProdWithoutZeros op where there's only one zero, and to output a
`ProdWithoutZeros` op where there's only one zero, and to output a
derivative of zero for any element part of a group with more than
one zero.
I do this by first counting the number of zeros in each group (see
the "T.eq()" bits), then taking this or that behavior (see T.switch)
I do this by first counting the number of zeros in each group (see the
`aet.eq` bits), then taking this or that behavior (see `aet.switch`)
based on the result of this count.
"""
......@@ -2532,7 +2532,7 @@ class Prod(CAReduceDtype):
gz = gz.dimshuffle(new_dims)
# division trick if we don't have zeros. This will contain
# NaNs to be eliminated in the T.switch if we do have zeros.
# NaNs to be eliminated in the `aet.switch` if we do have zeros.
grad_case_without_zeros = gz * prod_out / prod_in
if self.no_zeros_in_input:
......
......@@ -148,8 +148,7 @@ def local_0_dot_x(fgraph, node):
@register_canonicalize
@local_optimizer([DimShuffle])
def local_lift_transpose_through_dot(fgraph, node):
"""
dot(x,y).T -> dot(y.T, x.T)
"""Perform the rewrite ``dot(x,y).T -> dot(y.T, x.T)``
These optimizations "lift" (propagate towards the inputs) DimShuffle
through dot product. It allows to put the graph in a more standard shape,
......@@ -231,8 +230,9 @@ def local_func_inv(fgraph, node):
@local_optimizer([Sum])
def local_sumsqr2dot(fgraph, node):
"""
This optimization detects T.sqr( W.dimshuffle('x',0,1) * G.dimshuffle(0,'x',1) ).sum(axis=(1,2))
and converts this to T.dot(T.sqr(G), T.sqr(W).sum(axis=0)).
This optimization detects
    ``aet.sqr(W.dimshuffle("x", 0, 1) * G.dimshuffle(0, "x", 1)).sum(axis=(1, 2))``
and converts it to ``aet.dot(aet.sqr(G), aet.sqr(W).sum(axis=0))``.
"""
if (
isinstance(node.op, Sum)
......@@ -305,24 +305,30 @@ def local_expm1(fgraph, node):
def local_mul_switch_sink(fgraph, node):
"""
This optimization makes the following changes in the graph:
T.mul(A,T.switch(cond,0,iff),B) --> T.switch(cond,0,T.mul(A,B,iff))
T.mul(A,T.switch(cond,ift,0),B) --> T.switch(cond,T.mul(A,B,ift),0)
A and B being several (or none) symbolic variables.
This is useful because A and B may not be numerically stable and give
``aet.mul(A, aet.switch(cond, 0, iff), B)`` -> ``aet.switch(cond, 0, aet.mul(A, B, iff))``
``aet.mul(A, aet.switch(cond, ift, 0), B)`` -> ``aet.switch(cond, aet.mul(A, B, ift), 0)``
``A`` and ``B`` being several (or none) symbolic variables.
This is useful because ``A`` and ``B`` may not be numerically stable and give
NaN or inf values for cases where the switch returns 0.
With this optimization T.grad(T.switch(...)) has the right behavior.
With this optimization ``aet.grad(aet.switch(...))`` has the right behavior.
Examples
--------
x -> f(x)
x -> g(x)
y = T.switch(cond,f(x),g(x))
**without the optimization
T.grad(y,x) -> grad(f(x),x) * grad(y,f(x)) + grad(g(x),x) * grad(y,g(x))
**with the optimization
T.grad(y,x) -> switch(cond,grad(f(x),x), 0) + switch(cond,0,grad(g(x),x))
This will be particularly useful for the lazyif because we skip
an entire part of the graph.
x -> f(x)
x -> g(x)
y = aet.switch(cond, f(x), g(x))
without the optimization:
aet.grad(y, x) -> grad(f(x), x) * grad(y, f(x)) + grad(g(x), x) * grad(y, g(x))
with the optimization
aet.grad(y, x) -> switch(cond, grad(f(x), x), 0) + switch(cond, 0, grad(g(x), x))
This will be particularly useful for the lazy ``if`` because we skip an entire
part of the graph.
"""
if node.op != mul:
......@@ -393,13 +399,16 @@ def local_mul_switch_sink(fgraph, node):
def local_div_switch_sink(fgraph, node):
"""
This optimization makes the following changes in the graph:
T.div(T.switch(cond,0,iff),A) --> T.switch(cond,0,T.div(iff,A))
T.div(T.switch(cond,ift,0),A) --> T.switch(cond,T.div(ift,A),0)
A being a symbolic variable.
This is useful because A may not be numerically stable and give
NaN or inf values for cases where the switch returns 0.
See local_mul_switch_sink for more details.
``aet.div(aet.switch(cond, 0, iff), A)`` -> ``aet.switch(cond, 0, aet.div(iff, A))``
``aet.div(aet.switch(cond, ift, 0), A)`` -> ``aet.switch(cond, aet.div(ift, A), 0)``
where ``A`` is a symbolic variable.
This is useful because ``A`` may not be numerically stable and give
``nan`` or ``inf`` values for cases where the switch returns 0.
See `local_mul_switch_sink` for more details.
"""
if node.op != true_div and node.op != int_div:
......@@ -1027,9 +1036,8 @@ def local_sum_prod_mul_by_scalar(fgraph, node):
# for same reason as above.
copy_stack_trace(node.outputs, new_op_output)
# If node.op is a T.elemwise.Prod, then the scalars need to be
# raised to the power of the number of elements in the input
# to the Prod
# If `node.op` is a `Prod`, then the scalars need to be raised to
# the power of the number of elements in the input to the `Prod`
if isinstance(node.op, Prod) and new_op_input_nb_elements != 1:
scalars = [s ** new_op_input_nb_elements for s in scalars]
......
......@@ -17,6 +17,7 @@ import warnings
import numpy as np
import aesara
from aesara import tensor as aet
from aesara.assert_op import Assert
from aesara.configdefaults import config
from aesara.graph.basic import Apply, Variable
......@@ -560,12 +561,12 @@ def assert_conv_shape(shape):
assert_shp = Assert(
f"The convolution would produce an invalid shape (dim[{int(i)}] < 0)."
)
out_shape.append(assert_shp(n, aesara.tensor.ge(n, 0)))
out_shape.append(assert_shp(n, aet.ge(n, 0)))
else:
assert_shp = Assert(
f"The convolution would produce an invalid shape (dim[{int(i)}] < 0)."
)
out_shape.append(assert_shp(n, aesara.tensor.gt(n, 0)))
out_shape.append(assert_shp(n, aet.gt(n, 0)))
return tuple(out_shape)
......@@ -597,7 +598,7 @@ def assert_shape(x, expected_shape, msg="Unexpected shape."):
tests = []
for i in range(x.ndim):
if expected_shape[i] is not None:
tests.append(aesara.tensor.eq(shape[i], expected_shape[i]))
tests.append(aet.eq(shape[i], expected_shape[i]))
if tests:
return Assert(msg)(x, *tests)
else:
......@@ -1862,13 +1863,11 @@ def bilinear_kernel_1D(ratio, normalize=True):
by the indicated ratio using bilinear interpolation in one dimension.
"""
T = aesara.tensor
half_kern = T.arange(1, ratio + 1, dtype=config.floatX)
kern = T.concatenate([half_kern, half_kern[-2::-1]])
half_kern = aet.arange(1, ratio + 1, dtype=config.floatX)
kern = aet.concatenate([half_kern, half_kern[-2::-1]])
if normalize:
kern /= T.cast(ratio, config.floatX)
kern /= aet.cast(ratio, config.floatX)
return kern
......@@ -1903,7 +1902,6 @@ def frac_bilinear_upsampling(input, frac_ratio):
sides. This does not happen when it is odd.
"""
T = aesara.tensor
row, col = input.shape[2:]
up_input = input.reshape((-1, 1, row, col))
......@@ -1928,15 +1926,15 @@ def frac_bilinear_upsampling(input, frac_ratio):
subsample = (frac_ratio[1], frac_ratio[1])
# duplicate borders of the input
concat_mat = T.concatenate(
concat_mat = aet.concatenate(
(up_input[:, :, :1, :], up_input, up_input[:, :, -1:, :]), axis=2
)
concat_mat = T.concatenate(
concat_mat = aet.concatenate(
(concat_mat[:, :, :, :1], concat_mat, concat_mat[:, :, :, -1:]), axis=3
)
# add padding for the pyramidal kernel
double_pad = (2 * T.as_tensor([row, col]) - 1) * np.array(ratio) + 1
double_pad = (2 * aet.as_tensor([row, col]) - 1) * np.array(ratio) + 1
pad = double_pad // 2
# build pyramidal kernel
......@@ -1945,25 +1943,25 @@ def frac_bilinear_upsampling(input, frac_ratio):
)
# add corresponding padding
pad_kern = T.concatenate(
pad_kern = aet.concatenate(
(
T.zeros(
aet.zeros(
tuple(kern.shape[:2]) + (pad[0], kern.shape[-1]),
dtype=config.floatX,
),
kern,
T.zeros(
aet.zeros(
tuple(kern.shape[:2]) + (double_pad[0] - pad[0], kern.shape[-1]),
dtype=config.floatX,
),
),
axis=2,
)
pad_kern = T.concatenate(
pad_kern = aet.concatenate(
(
T.zeros(tuple(pad_kern.shape[:3]) + (pad[1],), dtype=config.floatX),
aet.zeros(tuple(pad_kern.shape[:3]) + (pad[1],), dtype=config.floatX),
pad_kern,
T.zeros(
aet.zeros(
tuple(pad_kern.shape[:3]) + (double_pad[1] - pad[1],),
dtype=config.floatX,
),
......@@ -1972,7 +1970,7 @@ def frac_bilinear_upsampling(input, frac_ratio):
)
# upsample the input by passing it as kernel of conv and using filter_dilation
upsamp = T.nnet.conv2d(
upsamp = conv2d(
pad_kern,
concat_mat,
border_mode="valid",
......@@ -2048,7 +2046,6 @@ def bilinear_upsampling(
return frac_bilinear_upsampling(input, frac_ratio=frac_ratio)
# the remaining case if integer ratio with use_1D_kernel
T = aesara.tensor
try:
up_bs = batch_size * num_input_channels
except TypeError:
......@@ -2058,11 +2055,11 @@ def bilinear_upsampling(
# concatenating the first and last row and column
# first and last row
concat_mat = T.concatenate(
concat_mat = aet.concatenate(
(up_input[:, :, :1, :], up_input, up_input[:, :, -1:, :]), axis=2
)
# first and last col
concat_mat = T.concatenate(
concat_mat = aet.concatenate(
(concat_mat[:, :, :, :1], concat_mat, concat_mat[:, :, :, -1:]), axis=3
)
concat_col = col + 2
......
......@@ -2081,7 +2081,7 @@ def local_useless_crossentropy_softmax_1hot_with_bias_dx_alloc(fgraph, node):
assert dy.ndim == 1
if dy.owner is not None and isinstance(dy.owner.op, aet.Alloc):
# dz is the input of the Alloc op, i.e. T.alloc(dz, <shape>)
# dz is the input of the Alloc op, i.e. aet.alloc(dz, <shape>)
dz = dy.owner.inputs[0]
try:
......
......@@ -185,10 +185,10 @@ def batch_normalization_train(
axes = (0,) + tuple(range(2, inputs.ndim))
mean = inputs.mean(axes, keepdims=True)
var = inputs.var(axes, keepdims=True)
invstd = T.inv(T.sqrt(var + epsilon))
invstd = aet.inv(aet.sqrt(var + epsilon))
out = (inputs - mean) * gamma * invstd + beta
m = T.cast(T.prod(inputs.shape) / T.prod(mean.shape), 'float32')
    m = aet.cast(aet.prod(inputs.shape) / aet.prod(mean.shape), 'float32')
running_mean = running_mean * (1 - running_average_factor) + \\
mean * running_average_factor
running_var = running_var * (1 - running_average_factor) + \\
......@@ -332,9 +332,9 @@ def batch_normalization_test(
axes = (0,)
# for spatial normalization
axes = (0,) + tuple(range(2, inputs.ndim))
gamma, beta, mean, var = (T.addbroadcast(t, *axes)
gamma, beta, mean, var = (aet.addbroadcast(t, *axes)
for t in (gamma, beta, mean, var))
out = (inputs - mean) * gamma / T.sqrt(var + epsilon) + beta
out = (inputs - mean) * gamma / aet.sqrt(var + epsilon) + beta
"""
ndim = inputs.ndim
axes, non_bc_axes = _prepare_batch_normalization_axes(axes, ndim)
......
......@@ -1920,8 +1920,8 @@ class TestConv2dGrads:
def test_conv2d_grad_wrt_inputs(self):
# Compares calculated abstract grads wrt inputs with the fwd grads
# This method checks the outputs of conv2_grad_wrt_inputs against
# the outputs of T.nnet.conv forward grads to make sure the
# This method checks the outputs of `conv2_grad_wrt_inputs` against
# the outputs of `aesara.tensor.nnet.conv` forward grads to make sure the
# results are the same.
for (in_shape, fltr_shape) in zip(self.inputs_shapes, self.filters_shapes):
......@@ -1986,8 +1986,8 @@ class TestConv2dGrads:
def test_conv2d_grad_wrt_weights(self):
# Compares calculated abstract grads wrt weights with the fwd grads
# This method checks the outputs of conv2_grad_wrt_weights against
# the outputs of T.nnet.conv forward grads to make sure the
# This method checks the outputs of `conv2_grad_wrt_weights` against
# the outputs of `aesara.tensor.nnet.conv` forward grads to make sure the
# results are the same.
for (in_shape, fltr_shape) in zip(self.inputs_shapes, self.filters_shapes):
......
......@@ -906,7 +906,7 @@ def test_gemm_nested():
def test_gemm_opt_wishlist():
X, Y, Z, a, b = matrix(), matrix(), matrix(), scalar(), scalar()
# with >2 additions of the same T.dot(X,Y term
# with >2 additions of the same ``aet.dot(X, Y)`` term
just_gemm([X, Y, Z, a, b], [(b * b) * Z * a + (a * a) * dot(X, Y) + b * dot(X, Y)])
just_gemm([X, Y, Z, a, b], [Z + dot(X, Y) + dot(X, Y)])
......
......@@ -842,7 +842,7 @@ class TestAlgebraicCanonize:
# 4 * x / abs(2*x) it get simplifier during canonicalisation.
x = dscalar()
# a = T.abs_(x)
# a = aet.abs_(x)
if config.mode == "FAST_COMPILE":
mode = get_mode("FAST_RUN").excluding("local_elemwise_fusion")
......@@ -2366,7 +2366,6 @@ def test_local_pow_specialize():
assert len(nodes) == 2
assert nodes[0] == sqr
assert isinstance(nodes[1].scalar_op, aes.basic.Inv)
# assert nodes == [T.sqr,T.inv]#Why this don't work?
utt.assert_allclose(f(val_no0), val_no0 ** (-2))
f = function([v], v ** (0.5), mode=mode)
......@@ -2379,7 +2378,6 @@ def test_local_pow_specialize():
assert len(nodes) == 2
assert nodes[0] == sqrt
assert isinstance(nodes[1].scalar_op, aes.basic.Inv)
# assert nodes == [T.sqrt,T.inv]#Why this don't work?
utt.assert_allclose(f(val_no0), val_no0 ** (-0.5))
......
......@@ -127,10 +127,10 @@ class LogisticRegression:
the learning rate is less dependent on the batch size
"""
# y.shape[0] is (symbolically) the number of rows in y, i.e., number of examples (call it n) in the minibatch
# T.arange(y.shape[0]) is a symbolic vector which will contain [0,1,2,... n-1]
# T.log(self.p_y_given_x) is a matrix of Log-Probabilities (call it LP) with one row per example and one column per class
# LP[T.arange(y.shape[0]),y] is a vector v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ..., LP[n-1,y[n-1]]]
# and T.mean(LP[T.arange(y.shape[0]),y]) is the mean (across minibatch examples) of the elements in v,
# aet.arange(y.shape[0]) is a symbolic vector which will contain [0,1,2,... n-1]
# aet.log(self.p_y_given_x) is a matrix of Log-Probabilities (call it LP) with one row per example and one column per class
# LP[aet.arange(y.shape[0]),y] is a vector v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ..., LP[n-1,y[n-1]]]
# and aet.mean(LP[aet.arange(y.shape[0]),y]) is the mean (across minibatch examples) of the elements in v,
# i.e., the mean log-likelihood across the minibatch.
return log(self.p_y_given_x[aet.arange(y.shape[0]), y])
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论