# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# Tests for the MXNet contrib control-flow operators:
# mx.{sym,nd}.contrib.while_loop, foreach and cond.  Each test compares the
# imperative (NDArray + autograd) execution against the symbolic (bind/forward/
# backward) execution of the same loop/branch body, for both outputs and
# gradients.

import copy
import numpy as np
import mxnet as mx
from mxnet import gluon
from numpy.testing import assert_allclose, assert_array_equal
from collections import defaultdict
from mxnet.test_utils import *
from mxnet.base import _as_list
from mxnet.attribute import AttrScope
from common import with_seed


@with_seed()
def test_while_loop_simple_forward():
    """Forward-only sanity checks of contrib.while_loop via a Gluon block.

    Cases 1.x return no per-step outputs (func's first element is None);
    cases 2.x additionally stack a per-step output.  Each case runs both
    non-hybridized and hybridized.
    """

    class _TestBlock(gluon.HybridBlock):

        def __init__(self, cond, func, max_iterations):
            super(_TestBlock, self).__init__()
            self.cond = cond
            self.func = func
            self.max_iterations = max_iterations

        def hybrid_forward(self, F, *loop_vars):
            # Delegates directly to the contrib while_loop operator.
            return F.contrib.while_loop(
                cond=self.cond,
                func=self.func,
                loop_vars=loop_vars,
                max_iterations=self.max_iterations
            )

    for hybridize in [False, True]:
        # Case 1.1: result should be sum([1, 2, 3, 4, 5]) — the cond stops the
        # loop after i exceeds 5, before max_iterations is reached.
        model = _TestBlock(
            cond=lambda i, s: i <= 5,
            func=lambda i, s: (None, (i + 1, s + i)),
            max_iterations=10,
        )
        if hybridize:
            model.hybridize()
        _, result = model(
            mx.nd.array([1], dtype="int64"),    # i
            mx.nd.array([0], dtype="int64"),    # s
        )
        assert result[0].asscalar() == 6
        assert result[1].asscalar() == 15
        # Case 1.2: result should be sum([1, 2, 3 ... 1000]) — cond is always
        # true, so the loop runs the full max_iterations.
        model = _TestBlock(
            cond=lambda i, s, true: true,
            func=lambda i, s, true: (None, (i + 1, s + i, true)),
            max_iterations=1000,
        )
        if hybridize:
            model.hybridize()
        _, result = model(
            mx.nd.array([1], dtype="int64"),    # i
            mx.nd.array([0], dtype="int64"),    # s
            mx.nd.array([1], dtype="int64"),    # true
        )
        assert result[0].asscalar() == 1001
        assert result[1].asscalar() == 500500
        assert result[2].asscalar() == 1
        # Case 1.3: result should be sum([]) — cond is false from the start,
        # so loop vars pass through unchanged.
        model = _TestBlock(
            cond=lambda i, s, false: false,
            func=lambda i, s, false: (None, (i + 1, s + i, false)),
            max_iterations=1000,
        )
        if hybridize:
            model.hybridize()
        _, result = model(
            mx.nd.array([1], dtype="int64"),    # i
            mx.nd.array([0], dtype="int64"),    # s
            mx.nd.array([0], dtype="int64"),    # false
        )
        assert result[0].asscalar() == 1
        assert result[1].asscalar() == 0
        assert result[2].asscalar() == 0
        # Case 2.1: result should be sum([1, 2, 3 ... 100]); the stacked
        # per-step outputs are checked against np.arange.
        model = _TestBlock(
            cond=lambda i, s: i <= 100,
            func=lambda i, s: (i, (i + 1, s + i)),
            max_iterations=1000,
        )
        if hybridize:
            model.hybridize()
        outputs, (result_i, result_s) = model(
            mx.nd.array([1], dtype="int64"),    # i
            mx.nd.array([0], dtype="int64"),    # s
        )
        assert all(outputs.asnumpy()[ : 100] == np.arange(1, 101).reshape(100, 1))
        assert result_i.asscalar() == 101
        assert result_s.asscalar() == 5050
        # Case 2.2: result should be sum([1, 2, 3 ... 1000])
        model = _TestBlock(
            cond=lambda i, s, true: true,
            func=lambda i, s, true: (i, (i + 1, s + i, true)),
            max_iterations=1000,
        )
        if hybridize:
            model.hybridize()
        outputs, (result_i, result_s, _) = model(
            mx.nd.array([1], dtype="int64"),    # i
            mx.nd.array([0], dtype="int64"),    # s
            mx.nd.array([1], dtype="int64"),    # true
        )
        assert all(outputs.asnumpy() == np.arange(1, 1001).reshape(1000, 1))
        assert result_i.asscalar() == 1001
        assert result_s.asscalar() == 500500
        # Case 2.3: a corner case, in which loop body is never executed
        model = _TestBlock(
            cond=lambda i, s, false: false,
            func=lambda i, s, false: (i, (i + 1, s + i, false)),
            max_iterations=1000,
        )
        if hybridize:
            model.hybridize()
        _, (result_i, result_s, _) = model(
            mx.nd.array([1], dtype="int64"),    # i
            mx.nd.array([0], dtype="int64"),    # s
            mx.nd.array([0], dtype="int64"),    # false
        )
        assert result_i.asscalar() == 1
        assert result_s.asscalar() == 0


def _verify_while_loop(cond, func, loop_var_shapes, free_var_shapes, is_train, max_iterations, is_for, n_steps):
    """Compare imperative vs symbolic while_loop for outputs and gradients.

    cond/func take (loop_vars, free_vars).  Loop vars are named "LoopVar<i>"
    and free vars "FreeVar<i>".  When is_for is True, LoopVar0 is the loop
    counter (shape (1,), initialized to 0) and is excluded from gradient
    checking (non-differentiable).  n_steps is the number of iterations the
    loop is expected to actually execute; stacked outputs are sliced to it.
    """

    def _create_vars(num, prefix):
        return [mx.sym.var(prefix + str(i)) for i in range(num)]

    def _create_arrays(shapes):
        return [mx.nd.random.uniform(-1.0, 1.0, shape=x) for x in shapes]

    def _create_dict(prefix, shapes):
        return {prefix + str(i): mx.nd.random.uniform(-1.0, 1.0, shape=x) for i, x in enumerate(shapes)}

    def _merge_dict(*dicts):
        result = {}
        for item in dicts:
            result.update(item)
        return result

    def _to_numpy_list(arrays):
        return [x.asnumpy() if x is not None else x for x in arrays]

    def _get_imperative_result(n_steps):
        # NDArray path: run the loop under autograd.record and backprop through
        # a concatenation of all (scaled) results.
        free_vars = [args["FreeVar" + str(i)].copy() for i, _ in enumerate(free_var_shapes)]
        loop_vars = [args["LoopVar" + str(i)].copy() for i, _ in enumerate(loop_var_shapes)]
        loop_var_start = int(is_for)    # skip the loop counter when is_for
        if is_train:
            for var in free_vars + loop_vars[loop_var_start: ]:
                var.attach_grad()
        with mx.autograd.record(train_mode=is_train):
            outputs, final_loop_vars = mx.nd.contrib.while_loop(
                cond=lambda *_loop_vars: cond(_loop_vars, free_vars),
                func=lambda *_loop_vars: func(_loop_vars, free_vars),
                loop_vars=loop_vars,
                max_iterations=max_iterations,
            )
            outputs = _as_list(outputs)
            final_loop_vars = _as_list(final_loop_vars)
            # Only the first n_steps rows of each stacked output are valid.
            outputs = [x[: n_steps] for x in outputs]
            out_grads = _create_arrays(x.shape for x in outputs) \
                      + _create_arrays(x.shape for x in final_loop_vars)
            # Scale outputs (*2) and states (*3) so their gradients differ.
            loop_result_nd = [x * 2 for x in outputs] + [x * 3 for x in final_loop_vars]
            grads = []
            if is_train:
                cat_out = mx.nd.concat(*[x.reshape(-1) for x in loop_result_nd], dim=0)
                cat_out.backward(out_grad=mx.nd.concat(*[x.reshape(-1) for x in out_grads], dim=0))
                grads = [free_vars[i].grad for i, _ in enumerate(free_var_shapes)] \
                      + [loop_vars[i].grad for i, _ in enumerate(loop_var_shapes) if i >= loop_var_start]
            return _to_numpy_list(loop_result_nd), _to_numpy_list(grads), out_grads

    def _get_symbolic_result(out_grads, n_steps):
        # Symbol path: build the same graph, bind an executor, and reuse the
        # head gradients generated by the imperative run.

        def _copy_args_dict(name_list):
            return {name: args[name].copy() for name in name_list}

        def _zeros_like_dict(name_list):
            return {name: mx.nd.zeros_like(args[name]) for name in name_list}

        free_syms = _create_vars(len(free_var_shapes), "FreeVar")
        loop_syms = _create_vars(len(loop_var_shapes), "LoopVar")
        outputs, final_loop_syms = mx.sym.contrib.while_loop(
            cond=lambda *_loop_vars: cond(_loop_vars, free_syms),
            func=lambda *_loop_vars: func(_loop_vars, free_syms),
            loop_vars=loop_syms,
            max_iterations=max_iterations,
        )
        outputs = _as_list(outputs)
        final_loop_syms = _as_list(final_loop_syms)
        if n_steps == 0:
            outputs = []
        else:
            outputs = [x.slice_axis(axis=0, begin=0, end=n_steps) for x in outputs]
        # Same *2 / *3 scaling as the imperative path.
        loop_result_sym = [x * 2 for x in outputs] + [x * 3 for x in final_loop_syms]
        loop_result_sym = mx.sym.Group(loop_result_sym)
        loop_var_start = int(is_for)
        args_names = ["FreeVar" + str(i) for i, _ in enumerate(free_var_shapes)] \
                   + ["LoopVar" + str(i) for i, _ in enumerate(loop_var_shapes) if i >= loop_var_start]
        args_grad = None if not is_train else _zeros_like_dict(x for x in args_names)
        executor = loop_result_sym.bind(
            ctx=default_context(),
            args=_copy_args_dict(loop_result_sym.list_inputs()),
            args_grad=args_grad,
        )
        loop_result_nd = executor.forward(is_train=is_train)
        grads = []
        if is_train:
            executor.backward(out_grads=out_grads)
            # grad_dict may lack vars unused by the pruned graph; keep None
            # placeholders so comparison can skip them.
            grads = [executor.grad_dict.get("FreeVar" + str(i), None) for i, _ in enumerate(free_var_shapes)] \
                  + [executor.grad_dict.get("LoopVar" + str(i), None) for i, _ in enumerate(loop_var_shapes) if i >= loop_var_start]
        return _to_numpy_list(loop_result_nd), _to_numpy_list(grads)

    args = _merge_dict(
        _create_dict("FreeVar", free_var_shapes),
        _create_dict("LoopVar", loop_var_shapes),
    )
    if is_for:
        assert loop_var_shapes[0] == (1, )
        args["LoopVar0"] = mx.nd.array([0])     # the loop counter starts at 0
    imp_outs, imp_grads, out_grads = _get_imperative_result(n_steps)
    sym_outs, sym_grads = _get_symbolic_result(out_grads, n_steps)
    for imp_out, sym_out in zip(imp_outs, sym_outs):
        if imp_out is None or sym_out is None:
            continue
        assert_almost_equal(imp_out, sym_out, rtol=1e-3, atol=1e-3)
    for imp_grad, sym_grad in zip(imp_grads, sym_grads):
        if imp_grad is None or sym_grad is None:
            continue
        assert_almost_equal(imp_grad, sym_grad, rtol=1e-3, atol=1e-3)


@with_seed()
def test_while_loop_for_foreach():
    """Exercise while_loop as a "for"/"foreach" substitute over many step bodies."""

    def make_true_cond():
        # Always-true predicate (any reasonable input is < 1e35).
        return lambda loop_vars, _: (loop_vars[0] < 1e35).prod()

    def make_false_cond():
        # Always-false predicate.
        return lambda loop_vars, _: (loop_vars[0] > 1e35).prod()

    def make_for_cond(length):
        # "for i in range(length)"-style predicate on the counter loop var.
        return lambda loop_vars, _: loop_vars[0] < length

    def case_0():
        # This is a simple testcase that all loop steps are independent
        # It basically scans the array and outputs itself
        # There is 1 output
        # There is 1 state: i
        def _simple_func(loop, free):
            (i, ), (scanned, ) = loop, free
            in_ = scanned.take(i).squeeze(axis=0)
            return (in_, i + 1)
        _verify_while_loop(
            cond=make_true_cond(),
            func=_simple_func,
            max_iterations=1,
            is_train=True,
            is_for=True,
            loop_var_shapes=[
                (1, ),          # i
            ],
            free_var_shapes=[
                (1, 3),         # scanned
            ],
            n_steps=1,
        )

    def case_1(**params):
        # This is a simple testcase that simulates a cumulative sum
        # There is 1 output
        # There is 1 state: s
        # The step functions below are algebraically equivalent permutations,
        # stressing different graph shapes for the same computation.
        step_funcs = [
            lambda a, b, s: s,
            lambda a, b, s: a * 1.5 + b * 2.5 - s * 3.5,
            lambda a, b, s: a * 1.5 - s * 3.5 + b * 2.5,
            lambda a, b, s: b * 2.5 + a * 1.5 - s * 3.5,
            lambda a, b, s: b * 2.5 - s * 3.5 + a * 1.5,
            lambda a, b, s: s * -3.5 + a * 1.5 + b * 2.5,
            lambda a, b, s: s * -3.5 + b * 2.5 + a * 1.5,
            lambda a, b, s: a * 2.5 * b + s * 0.3,
            lambda a, b, s: b * 2.5 * a + s * 0.3,
            lambda a, b, s: 2.5 * a * b + s * 0.3,
            lambda a, b, s: b * a * 2.5 + s * 0.3,
            lambda a, b, s: 2.5 * b * a + s * 0.3,
            lambda a, b, s: b * a * 2.5 + s * 0.3,
            lambda a, b, s: s * 0.3 + a * 2.5 * b,
            lambda a, b, s: s * 0.3 + b * 2.5 * a,
            lambda a, b, s: s * 0.3 + 2.5 * a * b,
            lambda a, b, s: s * 0.3 + b * a * 2.5,
            lambda a, b, s: s * 0.3 + 2.5 * b * a,
            lambda a, b, s: s * 0.3 + b * a * 2.5,
        ]
        def make_func(step_func):
            def step(loop, free):
                (s, ), (a, b) = loop, free
                out = step_func(a, b, s)
                return (out, out)
            return step
        case_id = 0
        for is_train in [True, False]:
            for step_func in step_funcs:
                case_id += 1
                _verify_while_loop(
                    func=make_func(step_func),
                    is_train=is_train,
                    is_for=False,
                    **params
                )

    def case_2(**params):
        # This is a testcase that involves non-differentiable operators
        # There is 1 output
        # There is 2 states: i, s
        step_funcs = [
            lambda in_, s, f_1: (in_ * 2) * s * f_1,
            lambda in_, s, f_1: (in_ * 2) * f_1 * s,
            lambda in_, s, f_1: s * (in_ * 2) * f_1,
            lambda in_, s, f_1: s * f_1 * (in_ * 2),
            lambda in_, s, f_1: f_1 * (in_ * 2) * s,
            lambda in_, s, f_1: f_1 * s * (in_ * 2),
            lambda in_, s, f_1: (2 * in_) * s * f_1,
            lambda in_, s, f_1: (2 * in_) * f_1 * s,
            lambda in_, s, f_1: s * (2 * in_) * f_1,
            lambda in_, s, f_1: s * f_1 * (2 * in_),
            lambda in_, s, f_1: f_1 * (2 * in_) * s,
            lambda in_, s, f_1: f_1 * s * (2 * in_),
        ]
        def make_func(step_func):
            """This simulates:
            def compute(s, inputs, f_1, length):
                outputs = []
                for i in range(length):
                    s += inputs[i] * 2 + f_1
                    outputs.append(s)
                return outputs, s
            """
            def step(loop, free):
                (i, s), (scanned, f_1, _) = loop, free
                in_ = scanned.take(i).squeeze(axis=0)
                out = step_func(in_, s, f_1)
                return (out, (i + 1, out))
            return step
        case_id = 0
        for is_train in [True, False]:
            for step_func in step_funcs:
                case_id += 1
                _verify_while_loop(
                    func=make_func(step_func),
                    max_iterations=1000,
                    is_train=is_train,
                    is_for=True,
                    **params
                )

    def case_3(length, **params):
        # This is a testcase for multiple non-differentiable operators and different ways of slicing
        # There are 2 outputs
        # There are 3 states: i, s_0, s_1
        step_funcs = [
            lambda i_0, i_1, s_0, s_1, f_0: i_0 * (i_1 * 2) * s_0 * (s_1 * 2) * f_0,
            lambda i_0, i_1, s_0, s_1, f_0: i_0 * (i_1 * 2) * s_0 * f_0 * (s_1 * 2),
            lambda i_0, i_1, s_0, s_1, f_0: i_0 * (i_1 * 2) * (s_1 * 2) * s_0 * f_0,
            lambda i_0, i_1, s_0, s_1, f_0: i_0 * (i_1 * 2) * (s_1 * 2) * f_0 * s_0,
            lambda i_0, i_1, s_0, s_1, f_0: (i_1 * 2) * i_0 * s_0 * (s_1 * 2) * f_0,
            lambda i_0, i_1, s_0, s_1, f_0: (i_1 * 2) * i_0 * s_0 * f_0 * (s_1 * 2),
            lambda i_0, i_1, s_0, s_1, f_0: (i_1 * 2) * i_0 * (s_1 * 2) * s_0 * f_0,
            lambda i_0, i_1, s_0, s_1, f_0: (i_1 * 2) * i_0 * (s_1 * 2) * f_0 * s_0,
            lambda i_0, i_1, s_0, s_1, f_0: i_0,
            lambda i_0, i_1, s_0, s_1, f_0: i_1,
            lambda i_0, i_1, s_0, s_1, f_0: s_0,
            lambda i_0, i_1, s_0, s_1, f_0: s_1,
            lambda i_0, i_1, s_0, s_1, f_0: f_0,
        ]
        def make_func(step_func):
            """This simulates:
            def compute(input_0, input_1, s_0, s_1, f_0, length):
                output_0 = []
                output_1 = []
                for i in range(length):
                    i_0 = input_0[i]
                    i_1 = input_1[length - 1 - i]
                    out = i_0 + (i_1 * 2) + s_0 + (s_1 * 2) + f_0
                    s_0 = (s_0 + out) * 1.05
                    s_1 = (s_1 - out * 0.5) * 0.95
                    output_0.append(out)
                    output_1.append(out * 1.5)
                return outputs, s_0, s_1
            """
            def step(loop, free):
                (i, s_0, s_1), (sc_0, sc_1, f_0, _) = loop, free
                i_0 = sc_0.take(i).squeeze(axis=0)
                i_1 = sc_1.take(length - 1 - i).squeeze(axis=0)
                out = step_func(i_0, i_1, s_0, s_1, f_0)
                return ([out, out * 1.5], [i + 1, (s_0 + out) * 1.05, (s_1 - out * 0.5) * 0.95])
            return step
        case_id = 0
        for is_train in [True, False]:
            for step_func in step_funcs:
                case_id += 1
                _verify_while_loop(
                    func=make_func(step_func),
                    max_iterations=1000,
                    is_train=is_train,
                    is_for=True,
                    **params
                )

    def case_4(length, single_shape, **params):
        # It is for the case that inputs & outputs are the same
        # There are 3 outputs
        # There are 4 states: i, s_0, s_1, s_2
        # i is used in both non-differentiable (take) and differentiable (+) occasions
        step_funcs = [
            lambda i_0, i_1, s_0, s_1, f_0: i_0 * (i_1 * 2) * s_0 * (s_1 * 2) * f_0,
            lambda i_0, i_1, s_0, s_1, f_0: i_0 * (i_1 * 2) * s_0 * f_0 * (s_1 * 2),
            lambda i_0, i_1, s_0, s_1, f_0: i_0 * (i_1 * 2) * (s_1 * 2) * s_0 * f_0,
            lambda i_0, i_1, s_0, s_1, f_0: i_0 * (i_1 * 2) * (s_1 * 2) * f_0 * s_0,
            lambda i_0, i_1, s_0, s_1, f_0: (i_1 * 2) * i_0 * s_0 * (s_1 * 2) * f_0,
            lambda i_0, i_1, s_0, s_1, f_0: (i_1 * 2) * i_0 * s_0 * f_0 * (s_1 * 2),
            lambda i_0, i_1, s_0, s_1, f_0: (i_1 * 2) * i_0 * (s_1 * 2) * s_0 * f_0,
            lambda i_0, i_1, s_0, s_1, f_0: (i_1 * 2) * i_0 * (s_1 * 2) * f_0 * s_0,
            lambda i_0, i_1, s_0, s_1, f_0: i_0,
            lambda i_0, i_1, s_0, s_1, f_0: i_1,
            lambda i_0, i_1, s_0, s_1, f_0: s_0,
            lambda i_0, i_1, s_0, s_1, f_0: s_1,
            lambda i_0, i_1, s_0, s_1, f_0: f_0,
        ]
        def make_func(step_func):
            """This simulates:
            def compute(input_0, input_1, s_0, s_1, s_2, f_0, length):
                # here s_2 remains untouched
                output_0 = []
                output_1 = []
                output_2 = []
                for i in range(length):
                    i_0 = input_0[i]
                    i_1 = input_1[length - 1 - i]
                    out = i_0 + (i_1 * 2) + s_0 + (s_1 * 2) + f_0
                    out = out * i * i_0 * i_1
                    s_0 = (s_0 + out) * 1.05
                    s_1 = (s_1 - out * 0.5) * 0.95
                    output_0.append(out)
                    output_1.append(f_0)
                    output_2.append(out * 1.5)
                return output_0, output_1, output_2, s_0, s_1, s_2
            """
            def step(loop, free):
                (i, s_0, s_1, s_2), (sc_0, sc_1, f_0, _) = loop, free
                i_0 = sc_0.take(i).squeeze(axis=0)
                i_1 = sc_1.take(length - 1 - i).squeeze(axis=0)
                out = step_func(i_0, i_1, s_0, s_1, f_0)
                # differentiable use of the counter i
                out = out * i.reshape([1] * len(single_shape)).broadcast_to(single_shape)
                out = out * i_0 * i_1
                return ([out, f_0, out * 1.5], [i + 1, (s_0 + out) * 1.05, (s_1 - out * 0.5) * 0.95, s_2])
            return step
        case_id = 0
        for is_train in [True, False]:
            for step_func in step_funcs:
                case_id += 1
                _verify_while_loop(
                    func=make_func(step_func),
                    max_iterations=1000,
                    is_train=is_train,
                    is_for=True,
                    **params
                )

    def case_5(length, single_shape, **params):
        # It is for the case that inputs & outputs are the same
        # There are 0 outputs
        # There are 4 states: i, s_0, s_1, s_2
        # i is used in both differentiable (take) and non-differentiable (+) occasions
        step_funcs = [
            lambda i_0, i_1, s_0, s_1, f_0: i_0 * (i_1 * 2) * s_0 * (s_1 * 2) * f_0,
            lambda i_0, i_1, s_0, s_1, f_0: i_0 * (i_1 * 2) * s_0 * f_0 * (s_1 * 2),
            lambda i_0, i_1, s_0, s_1, f_0: i_0 * (i_1 * 2) * (s_1 * 2) * s_0 * f_0,
            lambda i_0, i_1, s_0, s_1, f_0: i_0 * (i_1 * 2) * (s_1 * 2) * f_0 * s_0,
            lambda i_0, i_1, s_0, s_1, f_0: (i_1 * 2) * i_0 * s_0 * (s_1 * 2) * f_0,
            lambda i_0, i_1, s_0, s_1, f_0: (i_1 * 2) * i_0 * s_0 * f_0 * (s_1 * 2),
            lambda i_0, i_1, s_0, s_1, f_0: (i_1 * 2) * i_0 * (s_1 * 2) * s_0 * f_0,
            lambda i_0, i_1, s_0, s_1, f_0: (i_1 * 2) * i_0 * (s_1 * 2) * f_0 * s_0,
            lambda i_0, i_1, s_0, s_1, f_0: i_0,
            lambda i_0, i_1, s_0, s_1, f_0: i_1,
            lambda i_0, i_1, s_0, s_1, f_0: s_0,
            lambda i_0, i_1, s_0, s_1, f_0: s_1,
            lambda i_0, i_1, s_0, s_1, f_0: f_0,
        ]
        def make_func(step_func):
            """This simulates:
            def compute(input_0, input_1, s_0, s_1, s_2, f_0, length):
                # here s_2 remains untouched
                output_0 = []
                output_1 = []
                output_2 = []
                for i in range(length):
                    i_0 = input_0[i]
                    i_1 = input_1[length - 1 - i]
                    out = i_0 + (i_1 * 2) + s_0 + (s_1 * 2) + f_0
                    out = out * i * i_0 * i_1
                    s_0 = (s_0 + out) * 1.05
                    s_1 = (s_1 - out * 0.5) * 0.95
                    output_0.append(out)
                    output_1.append(f_0)
                    output_2.append(out * 1.5)
                return output_0, output_1, output_2, s_0, s_1, s_2
            """
            def step(loop, free):
                (i, s_0, s_1, s_2), (sc_0, sc_1, f_0, _) = loop, free
                i_0 = sc_0.take(i).squeeze(axis=0)
                i_1 = sc_1.take(length - 1 - i).squeeze(axis=0)
                out = step_func(i_0, i_1, s_0, s_1, f_0)
                out = out * i.reshape([1] * len(single_shape)).broadcast_to(single_shape)
                out = out * i_0 * i_1
                # no per-step outputs; only states are carried
                return ([], [i + 1, (s_0 + out) * 1.05, (s_1 - out * 0.5) * 0.95, s_2])
            return step
        case_id = 0
        for is_train in [True, False]:
            for step_func in step_funcs:
                case_id += 1
                _verify_while_loop(
                    func=make_func(step_func),
                    max_iterations=1000,
                    is_train=is_train,
                    is_for=True,
                    **params
                )

    def case_6(length, single_shape, **params):
        # It is for the case that inputs & outputs are the same
        # There are 3 outputs
        # There are 4 states: i, s_0, s_1, s_2
        # i is used in both differentiable (take) and non-differentiable (+) occasions
        step_funcs = [
            lambda i_0, i_1, s_0, s_1, f_0: i_0 * (i_1 * 2) * s_0 * (s_1 * 2) * f_0,
            lambda i_0, i_1, s_0, s_1, f_0: i_0 * (i_1 * 2) * s_0 * f_0 * (s_1 * 2),
            lambda i_0, i_1, s_0, s_1, f_0: i_0 * (i_1 * 2) * (s_1 * 2) * s_0 * f_0,
            lambda i_0, i_1, s_0, s_1, f_0: i_0 * (i_1 * 2) * (s_1 * 2) * f_0 * s_0,
            lambda i_0, i_1, s_0, s_1, f_0: (i_1 * 2) * i_0 * s_0 * (s_1 * 2) * f_0,
            lambda i_0, i_1, s_0, s_1, f_0: (i_1 * 2) * i_0 * s_0 * f_0 * (s_1 * 2),
            lambda i_0, i_1, s_0, s_1, f_0: (i_1 * 2) * i_0 * (s_1 * 2) * s_0 * f_0,
            lambda i_0, i_1, s_0, s_1, f_0: (i_1 * 2) * i_0 * (s_1 * 2) * f_0 * s_0,
            lambda i_0, i_1, s_0, s_1, f_0: i_0,
            lambda i_0, i_1, s_0, s_1, f_0: i_1,
            lambda i_0, i_1, s_0, s_1, f_0: s_0,
            lambda i_0, i_1, s_0, s_1, f_0: s_1,
            lambda i_0, i_1, s_0, s_1, f_0: f_0,
        ]
        def make_func(step_func):
            """This simulates:
            def compute(input_0, input_1, s_0, s_1, s_2, f_0, length):
                # here s_2 remains untouched
                output_0 = []
                output_1 = []
                output_2 = []
                for i in range(length):
                    i_0 = input_0[i]
                    i_1 = input_1[length - 1 - i]
                    out = i_0 + (i_1 * 2) + s_0 + (s_1 * 2) + f_0
                    out = out * i * i_0 * i_1
                    s_0 = (s_0 + out) * 1.05
                    s_1 = (s_1 - out * 0.5) * 0.95
                    output_0.append(out)
                    output_1.append(f_0)
                    output_2.append(out * 1.5)
                return output_0, output_1, output_2, s_0, s_1, s_2
            """
            def step(loop, free):
                (i, s_0, s_1, s_2), (sc_0, sc_1, f_0, _) = loop, free
                # the step body must work both symbolically and imperatively
                F = mx.sym if isinstance(i, mx.sym.Symbol) else mx.nd
                i_0 = sc_0.take(i).squeeze(axis=0)
                i_1 = sc_1.take(length - 1 - i).squeeze(axis=0)
                out_0 = step_func(i_0, i_1, s_0, s_1, f_0)
                out_0 = out_0 * i.reshape([1] * len(single_shape)).broadcast_to(single_shape)
                out_1 = step_func(i_1, s_0, f_0, s_1, i_0)
                out_1 = out_1 * i.reshape([1] * len(single_shape)).broadcast_to(single_shape)
                return ([F.dot(out_0, s_2), f_0, F.dot(s_2, out_1) * 1.5], [i + 1, (s_0 + out_1) * 1.05, (s_1 - out_0 * 0.5) * 0.95, s_2])
            return step
        case_id = 0
        for is_train in [True, False]:
            for step_func in step_funcs:
                case_id += 1
                _verify_while_loop(
                    func=make_func(step_func),
                    max_iterations=1000,
                    is_train=is_train,
                    is_for=True,
                    **params
                )

    # Case 0: the simplest case
    case_0()
    # Case 1.1.*
    case_1(
        cond=make_true_cond(),
        loop_var_shapes=[
            (1, ),          # s
        ],
        free_var_shapes=[
            (1, ),          # a
            (1, ),          # b
        ],
        max_iterations=5,
        n_steps=5,
    )
    # Case 1.2.*
    case_1(
        cond=make_true_cond(),
        loop_var_shapes=[
            (2, 3, 4),      # s
        ],
        free_var_shapes=[
            (2, 3, 4),      # a
            (2, 3, 4),      # b
        ],
        max_iterations=3,
        n_steps=3,
    )
    # Case 1.3.*
    case_1(
        cond=make_false_cond(),
        loop_var_shapes=[
            (2, 3, 4),      # s
        ],
        free_var_shapes=[
            (2, 3, 4),      # a
            (2, 3, 4),      # b
        ],
        max_iterations=20,
        n_steps=0,
    )
    # Case 2.1.*
    case_2(
        cond=make_for_cond(length=5),
        loop_var_shapes=[
            (1, ),          # i
            (2, ),          # s
        ],
        free_var_shapes=[
            (100, 2),       # scanned
            (2, ),          # f_1
            (3, 4, 5, 6),   # f_2, unused
        ],
        n_steps=5,
    )
    # Case 2.2.*
    case_2(
        cond=make_for_cond(length=3),
        loop_var_shapes=[
            (1, ),          # i
            (2, ),          # s
        ],
        free_var_shapes=[
            (30, 2),        # scanned
            (2, ),          # f_1
            (3, 4, 5, 6),   # f_2, unused
        ],
        n_steps=3,
    )
    # Case 3.*
    case_3(
        length=5,
        cond=make_for_cond(length=5),
        loop_var_shapes=[
            (1, ),          # i
            (2, ),          # s_0
            (2, ),          # s_1
        ],
        free_var_shapes=[
            (30, 2),        # sc_0
            (30, 2),        # sc_1
            (2, ),          # f_0
            (3, 4, 5, 6),   # f_1, unused
        ],
        n_steps=5,
    )
    # Case 4.1.*
    case_4(
        length=4,
        cond=make_for_cond(length=4),
        single_shape=[5],
        loop_var_shapes=[
            (1, ),          # i
            (5, ),          # s_0
            (5, ),          # s_1
            (23, 6, 8),     # s_2
        ],
        free_var_shapes=[
            (30, 5),        # sc_0
            (30, 5),        # sc_1
            (5, ),          # f_0
            (3, 4, 5, 6),   # f_1, unused
        ],
        n_steps=4,
    )
    # Case 4.2.*
    case_4(
        length=5,
        cond=make_for_cond(length=5),
        single_shape=[5, 12],
        loop_var_shapes=[
            (1, ),          # i
            (5, 12),        # s_0
            (5, 12),        # s_1
            (23, 6, 8),     # s_2
        ],
        free_var_shapes=[
            (30, 5, 12),    # sc_0
            (30, 5, 12),    # sc_1
            (5, 12),        # f_0
            (3, 4, 5, 6),   # f_1, unused
        ],
        n_steps=5,
    )
    # Case 5.1.*
    case_5(
        length=4,
        cond=make_for_cond(length=4),
        single_shape=[5],
        loop_var_shapes=[
            (1, ),          # i
            (5, ),          # s_0
            (5, ),          # s_1
            (23, 6, 8),     # s_2
        ],
        free_var_shapes=[
            (30, 5),        # sc_0
            (30, 5),        # sc_1
            (5, ),          # f_0
            (3, 4, 5, 6),   # f_1, unused
        ],
        n_steps=4,
    )
    # Case 5.2.*
    case_5(
        length=5,
        cond=make_for_cond(length=5),
        single_shape=[3, 4, 2],
        loop_var_shapes=[
            (1, ),          # i
            (3, 4, 2),      # s_0
            (3, 4, 2),      # s_1
            (23, 6, 8),     # s_2
        ],
        free_var_shapes=[
            (30, 3, 4, 2),  # sc_0
            (30, 3, 4, 2),  # sc_1
            (3, 4, 2),      # f_0
            (3, 4, 5, 6),   # f_1, unused
        ],
        n_steps=5,
    )
    # Case 6.*
    case_6(
        length=5,
        cond=make_for_cond(length=5),
        single_shape=[5, 3],
        loop_var_shapes=[
            (1, ),          # i
            (5, 3),         # s_0
            (5, 3),         # s_1
            (3, 5),         # s_2
        ],
        free_var_shapes=[
            (30, 5, 3),     # sc_0
            (30, 5, 3),     # sc_1
            (5, 3),         # f_0
            (3, 4, 5, 6),   # f_1, unused
        ],
        n_steps=5,
    )


@with_seed()
def test_while_loop_nested():
    """Test a while_loop whose body itself runs another while_loop (2x2 scan)."""

    def _to_np_list(arrays):
        return [x.asnumpy() if x is not None else x for x in arrays]

    def _array(shape):
        return mx.nd.random.uniform(-1.0, 1.0, shape=shape)

    def inner_cond(i, j, x_sum, sc):
        return j < 2

    def inner_body(i, j, x_sum, sc):
        x_ij = sc.take(j).squeeze(axis=0)
        return (x_ij, x_ij), (i, j + 1, x_sum, sc)

    def outer_cond(i, j, x_sum, sc):
        return i < 2

    def outer_body(i, j, x_sum, sc):
        # F dispatches between symbolic and imperative APIs.
        F = mx.sym if isinstance(i, mx.sym.Symbol) else mx.nd
        (x_ij, x_ji), (i_p, j_p, x_sum_p, sc_p) = F.contrib.while_loop(
            cond=inner_cond,
            func=inner_body,
            loop_vars=(i, j, x_sum, sc),
            max_iterations=2,
        )
        # advance the outer counter and rewind the inner one (j_p - 2 -> 0)
        return (x_ij, x_ji), (i_p + 1, j_p - 2, x_sum_p, sc_p)

    def make_loop(i, j, x_sum, sc):
        F = mx.sym if isinstance(i, mx.sym.Symbol) else mx.nd
        (x_ij, x_ji), (new_i, new_j, x_sum_p, sc_p) = F.contrib.while_loop(
            cond=outer_cond,
            func=outer_body,
            loop_vars=(i, j, x_sum, sc),
            max_iterations=2,
        )
        return new_i, new_j, x_sum_p, sc_p, x_ij, x_ji

    args = {
        "i": mx.nd.array([0]),
        "j": mx.nd.array([0]),
        "x_sum": _array([5, 3]),
        "sc": _array([10, 10, 5, 3]),
    }
    args_grad = {
        "x_sum": _array([5, 3]),
        "sc": _array([10, 10, 5, 3]),
    }
    out_grad = [
        _array([1]),
        _array([1]),
        _array([5, 3]),
        _array([10, 10, 5, 3]),
        _array([2, 2, 10, 5, 3]),
        _array([2, 2, 10, 5, 3]),
    ]

    def _get_imp_result(is_train, args, args_grad, out_grad):
        args = {k: v.copy() for k, v in args.items()}
        args_grad = {k: v.copy() for k, v in args_grad.items()}
        i, j, x_sum, sc = [args[x].copy() for x in ["i", "j", "x_sum", "sc"]]
        if is_train:
            x_sum.attach_grad()
            sc.attach_grad()
        with mx.autograd.record(train_mode=is_train):
            results = make_loop(i, j, x_sum, sc)
            cat_res = mx.nd.concat(*[x.reshape(-1) for x in results], dim=0)
        if not is_train:
            return _to_np_list(results), []
        cat_grad = mx.nd.concat(*[x.reshape(-1) for x in out_grad], dim=0)
        assert cat_grad.shape == cat_res.shape
        cat_res.backward(out_grad=cat_grad)
        grads = [x_sum.grad, sc.grad]
        return _to_np_list(results), _to_np_list(grads)

    def _get_sym_result(is_train, args, args_grad, out_grad):
        args = {k: v.copy() for k, v in args.items()}
        args_grad = {k: v.copy() for k, v in args_grad.items()}
        i, j, x_sum, sc = [
            mx.sym.var("i"),
            mx.sym.var("j"),
            mx.sym.var("x_sum"),
            mx.sym.var("sc"),
        ]
        result_sym = mx.sym.Group(make_loop(i, j, x_sum, sc))
        executor = result_sym.bind(
            ctx=default_context(),
            args=args,
            args_grad=args_grad,
        )
        results = executor.forward(is_train=is_train)
        if not is_train:
            return _to_np_list(results), []
        executor.backward(out_grads=out_grad)
        grads = [executor.grad_dict["x_sum"], executor.grad_dict["sc"]]
        return _to_np_list(results), _to_np_list(grads)

    for is_train in [True, False]:
        imp_out, imp_grad = _get_imp_result(is_train=is_train, args=args, args_grad=args_grad, out_grad=out_grad)
        sym_out, sym_grad = _get_sym_result(is_train=is_train, args=args, args_grad=args_grad, out_grad=out_grad)
        assert len(imp_out) == len(sym_out)
        assert len(imp_grad) == len(sym_grad)
        for x, y in zip(imp_out, sym_out):
            assert_almost_equal(x, y, rtol=1e-3, atol=1e-3)
        for x, y in zip(imp_grad, sym_grad):
            assert_almost_equal(x, y, rtol=1e-3, atol=1e-3)


@with_seed()
def test_while_loop_rnn():
    """Compare an RNN driven by while_loop against a manually unrolled RNN."""

    def _array(shape):
        return mx.nd.random.uniform(-1.0, 1.0, shape=shape)

    cell_types = [mx.rnn.LSTMCell]
    num_params = [2]    # number of state vars per cell type (LSTM: h, c)

    batch_size = 2
    hidden_dim = 3
    input_dim = 4
    seq_len = 3

    for cell, n_param in zip(cell_types, num_params):
        # using while_loop
        params = mx.rnn.RNNParams()
        data = mx.sym.var("data")
        iter_i = mx.sym.var("i")

        def _cond(*states):
            i = states[0]
            return i < seq_len

        def _func(*states):
            i = states[0]
            states = states[1:]
            in_ = data.take(i).squeeze(axis=0)
            rnn = cell(hidden_dim, prefix='', params=params)
            next_hidden, next_states = rnn(in_, states)
            return [next_hidden], [i + 1] + list(next_states)

        states = [mx.sym.var("s_" + str(i)) for i in range(n_param)]
        result = mx.sym.contrib.while_loop(
            cond=_cond,
            func=_func,
            loop_vars=[iter_i] + states,
            max_iterations=seq_len
        )
        # group outputs with final states (dropping the counter)
        result = mx.sym.Group(result[0] + result[1][1: ])
        arg_shapes, _, _ = result.infer_shape(
            data=(seq_len, batch_size, input_dim),
            s_0=(batch_size, hidden_dim),
        )
        rnn_inputs = result.list_inputs()
        args = {name: _array(arg_shapes[i]) for i, name in enumerate(rnn_inputs) if name != "i"}
        args["i"] = mx.nd.zeros([1])
        args_grad = {name: _array(arg_shapes[i]) for i, name in enumerate(rnn_inputs)}
        e_1 = result.bind(ctx=default_context(),
            args={name: array.copy() for name, array in args.items()},
            args_grad={name: array.copy() for name, array in args_grad.items() if name != "i"},
        )
        # using unrolled rnn
        rnn = cell(hidden_dim, prefix='')
        unroll_outs = []
        for inputs in mx.sym.split(data, num_outputs=seq_len, axis=0, squeeze_axis=True):
            h, states = rnn(inputs, states)
            unroll_outs.append(mx.sym.expand_dims(h, axis=0))
        unroll_outs = _as_list(mx.sym.concat(*unroll_outs, dim=0))
        unroll_outs.extend(states)
        result = mx.sym.Group(unroll_outs)
        e_2 = result.bind(ctx=default_context(),
            args={name: array.copy() for name, array in args.items() if name != "i"},
            args_grad={name: array.copy() for name, array in args_grad.items() if name != "i"},
        )
        # both executors must agree on outputs and gradients over repeated runs
        for case_id in range(100):
            out_grads = [_array(arr.shape) for arr in e_1.outputs]
            args = {name: array.copy() for name, array in args.items()}
            e_1.forward(is_train=True, **args)
            e_1.backward(out_grads)
            args = {name: array.copy() for name, array in args.items() if name != "i"}
            e_2.forward(is_train=True, **args)
            e_2.backward(out_grads)
            assert len(e_1.outputs) == len(e_2.outputs)
            for x, y in zip(e_1.outputs, e_2.outputs):
                x = x.asnumpy()
                y = y.asnumpy()
                assert_almost_equal(x, y, rtol=1e-3, atol=1e-3)
            grad_keys = list(e_2.grad_dict.keys())
            e_1_grad = [e_1.grad_dict[x] for x in grad_keys]
            e_2_grad = [e_2.grad_dict[x] for x in grad_keys]
            for x, y in zip(e_1_grad, e_2_grad):
                x = x.asnumpy()
                y = y.asnumpy()
                assert_almost_equal(x, y, rtol=1e-3, atol=1e-3)


def _verify_cond(cond_func, then_func, else_func, input_var_shapes, free_var_shapes, is_train):
    """Compare imperative vs symbolic contrib.cond for outputs and gradients.

    cond_func/then_func/else_func take (input_vars, free_vars).  Inputs are
    named "InputVar<i>" and free vars "FreeVar<i>"; both contribute gradients.
    """

    def _create_symbol(prefix, i):
        return mx.sym.var(prefix + str(i))

    def _create_array(shape):
        return mx.nd.random.uniform(-1.0, 1.0, shape=shape)

    def _to_numpy_list(arrays):
        return [x.asnumpy() if x is not None else x for x in arrays]

    def _merge_dict(*dicts):
        result = {}
        for item in dicts:
            result.update(item)
        return result

    _input_syms = [_create_symbol("InputVar", i) for i, _ in enumerate(input_var_shapes)]
    _free_syms = [_create_symbol("FreeVar", i) for i, _ in enumerate(free_var_shapes)]
    _input_vars = [_create_array(x) for x in input_var_shapes]
    _free_vars = [_create_array(x) for x in free_var_shapes]
    _args_dict = _merge_dict(
        {"InputVar" + str(i): x for i, x in enumerate(_input_vars)},
        {"FreeVar" + str(i): x for i, x in enumerate(_free_vars)},
    )

    def _get_imperative_result():
        free_vars = [x.copy() for x in _free_vars]
        input_vars = [x.copy() for x in _input_vars]
        out_grads = []
        if is_train:
            for var in free_vars + input_vars:
                var.attach_grad()
        with mx.autograd.record(train_mode=is_train):
            outputs = mx.nd.contrib.cond(
                pred=cond_func(input_vars, free_vars),
                then_func=lambda: then_func(input_vars, free_vars),
                else_func=lambda: else_func(input_vars, free_vars),
            )
            outputs = _as_list(outputs)
            outputs = [x * 2 for x in outputs]
            grads = []
            if is_train:
                out_grads = [_create_array(x.shape) for x in outputs]
                cat_out = mx.nd.concat(*[x.reshape(-1) for x in outputs], dim=0)
                cat_out.backward(out_grad=mx.nd.concat(*[x.reshape(-1) for x in out_grads], dim=0))
                grads = [free_vars[i].grad for i, _ in enumerate(free_var_shapes)] \
                      + [input_vars[i].grad for i, _ in enumerate(input_var_shapes)]
        return _to_numpy_list(outputs), _to_numpy_list(grads), out_grads

    def _get_symbolic_result(out_grads):
        outputs_sym = mx.sym.contrib.cond(
            pred=cond_func(_input_syms, _free_syms),
            then_func=lambda: then_func(_input_syms, _free_syms),
            else_func=lambda: else_func(_input_syms, _free_syms),
        )
        outputs_sym = _as_list(outputs_sym)
        outputs_sym = [x * 2 for x in outputs_sym]
        outputs_sym = mx.sym.Group(outputs_sym)
        executor = outputs_sym.bind(
            ctx=default_context(),
            args={name: _args_dict[name].copy() for name in outputs_sym.list_inputs()},
            args_grad=None if not is_train else _merge_dict(
                {"InputVar" + str(i): mx.nd.zeros(s) for i, s in enumerate(input_var_shapes)},
                {"FreeVar" + str(i): mx.nd.zeros(s) for i, s in enumerate(free_var_shapes)},
            ),
        )
        outputs = executor.forward(is_train=is_train)
        grads = []
        if is_train:
            executor.backward(out_grads=out_grads)
            # vars unused by the chosen branch may be absent; keep None holes
            grads = [executor.grad_dict.get("FreeVar" + str(i), None) for i, _ in enumerate(free_var_shapes)] \
                  + [executor.grad_dict.get("InputVar" + str(i), None) for i, _ in enumerate(input_var_shapes)]
        return _to_numpy_list(outputs), _to_numpy_list(grads)

    imp_outs, imp_grads, out_grads = _get_imperative_result()
    sym_outs, sym_grads = _get_symbolic_result(out_grads)
    for imp_out, sym_out in zip(imp_outs, sym_outs):
        if imp_out is None or sym_out is None:
            continue
        assert_almost_equal(imp_out, sym_out, rtol=1e-3, atol=1e-3)
    for imp_grad, sym_grad in zip(imp_grads, sym_grads):
        if imp_grad is None or sym_grad is None:
            continue
        assert_almost_equal(imp_grad, sym_grad, rtol=1e-3, atol=1e-3)


@with_seed()
def test_cond():
    """Enumerate contrib.cond over combinations of predicate/branch bodies.

    Dimensions covered:
    # whether there are free variables in three graphs
    # whether these three graphs contain input_vars
    # whether to use all input_vars
    # which branch to choose
    """
    def run_case(cond_func, then_func, else_func, **params):
        def make_cond(is_inverse):
            def cond(inputs, free):
                x = cond_func(inputs, free)
                if is_inverse:
                    if isinstance(x, mx.sym.Symbol):
                        return mx.sym.logical_not(x)
                    else:
                        return mx.nd.logical_not(x)
                return x
            return cond
        for is_train in [True, False]:
            for is_inverse in [False, True]:
                _verify_cond(
                    cond_func=make_cond(is_inverse),
                    then_func=then_func,
                    else_func=else_func,
                    is_train=is_train,
                    **params
                )
    # Each function can
    # 1. use_free_vars or not: T/F
    # 2. use_input_vars or not: T/F
    # 3. use_all_input_vars or not: T/F
    # (a, b, c) are inputs, (d, e, f) are free_vars
    cond_funcs = [
        lambda a, b, c, d, e, f: (a * b).sum() < 0.5,           # F, T, F
        lambda a, b, c, d, e, f: (a + b + c).sum() < 0.5,       # F, T, T
        lambda a, b, c, d, e, f: (d + e).sum() < 0.5,           # T, F, F
        lambda a, b, c, d, e, f: (d + e * a).sum() < 0.5,       # T, T, F
        lambda a, b, c, d, e, f: (d + e * a + b * c).sum() < 0.5,  # T, T, T
    ]
    body_funcs = [
        lambda a, b, c, d, e, f: a * b,                 # F, T, F
        lambda a, b, c, d, e, f: a * b * c,             # F, T, T
        lambda a, b, c, d, e, f: d * e,                 # T, F, F
        lambda a, b, c, d, e, f: d * e * a,             # T, T, F
        lambda a, b, c, d, e, f: d * e * a * b * c,     # T, T, T
        # some extra tests
        lambda a, b, c, d, e, f: b * c,
        lambda a, b, c, d, e, f: a * c,
        lambda a, b, c, d, e, f: (a + b) * c,
        lambda a, b, c, d, e, f: c * (b - a),
    ]
    # enumerate all kinds of possible combinations
    for cond_func in cond_funcs:
        for then_func in body_funcs:
            for else_func in body_funcs:
                run_case(
                    cond_func=lambda x, y: cond_func(x[0], x[1], x[2], y[0], y[1], y[2]),
                    then_func=lambda x, y: then_func(x[0], x[1], x[2], y[0], y[1], y[2]),
                    else_func=lambda x, y: else_func(x[0], x[1], x[2], y[0], y[1], y[2]),
                    input_var_shapes=[
                        (2, 3),
                        (2, 3),
                        (2, 3),
                    ],
                    free_var_shapes=[
                        (2, 3),
                        (2, 3),
                        (2, 3),
                    ]
                )


class TestRNNLayer(gluon.HybridBlock):
    """Gluon block that scans an RNN cell over inputs with contrib.foreach."""

    def __init__(self, cell_type, hidden_size, prefix=None, params=None):
        super(TestRNNLayer, self).__init__(prefix=prefix, params=params)
        self.cell = cell_type(hidden_size, prefix='rnn_')

    def hybrid_forward(self, F, inputs, states):
        out, states = F.contrib.foreach(self.cell, inputs, states)
        return out


def check_contrib_rnn(cell_type, num_states):
    """Train one step with an imperative foreach-RNN, then verify that every
    hybridization config reproduces the same outputs and updated weights."""
    batch_size = 10
    hidden_size = 100
    rnn_data = mx.nd.normal(loc=0, scale=1, shape=(5, batch_size, 50))
    state_shape = (batch_size, hidden_size)
    states = [mx.nd.normal(loc=0, scale=1, shape=state_shape) for i in range(num_states)]
    layer = TestRNNLayer(cell_type, hidden_size)
    layer.initialize(ctx=default_context())
    res1 = layer(rnn_data, states)
    params1 = layer.collect_params()
    orig_params1 = copy.deepcopy(params1)   # snapshot of the initial weights
    trainer = gluon.Trainer(params1, 'sgd', {'learning_rate' : 0.03})
    with mx.autograd.record():
        res1 = layer(rnn_data, states)
    res1.backward()
    trainer.step(batch_size)

    configs = [
        {},
        {'inline_limit': 0},
        {'static_alloc': True},
        {'static_alloc': True, 'static_shape': True}
    ]
    for config in configs:
        layer = TestRNNLayer(cell_type, hidden_size)
        layer.initialize(ctx=default_context())
        layer.hybridize(**config)
        res2 = layer(rnn_data, states)
        params2 = layer.collect_params()
        # reset to the same initial weights before the training step
        for key, val in orig_params1.items():
            params2[key].set_data(copy.deepcopy(val.data()))
        trainer = gluon.Trainer(params2, 'sgd', {'learning_rate' : 0.03})
        with mx.autograd.record():
            res2 = layer(rnn_data, states)
        assert_almost_equal(res1.asnumpy(), res2.asnumpy(), rtol=1e-3, atol=1e-3)
        res2.backward()
        trainer.step(batch_size)

        for key, val in params1.items():
            weight1 = val.data()
            weight2 = params2[key].data()
            assert_almost_equal(weight1.asnumpy(), weight2.asnumpy(), rtol=1e-3, atol=1e-3)
@with_seed()
def test_contrib_rnn():
    """Run check_contrib_rnn over RNN, LSTM and GRU cells."""
    cell_types = [(gluon.rnn.RNNCell, 1), (gluon.rnn.LSTMCell, 2), (gluon.rnn.GRUCell, 1)]
    for cell_type, num_states in cell_types:
        check_contrib_rnn(cell_type, num_states)


@with_seed()
def test_foreach():
    """Verify mx.contrib.foreach (symbolic) against an imperative re-execution
    for many step functions covering input/state/free-variable layouts."""
    # Symbolic placeholders; note the names deliberately lag the Python names
    # by 3 (v3 is "v0", ...) to match arg_dict keys 'v0', 'v1', ... below.
    v3 = mx.sym.var("v0")
    v4 = mx.sym.var("v1")
    v5 = mx.sym.var("v2")
    v6 = mx.sym.var("v3")
    v7 = mx.sym.var("v4")
    v8 = mx.sym.var("v5")

    def verify_foreach(step, in_syms, state_syms, free_syms,
            in_arrs, init_states, frees, out_grads, is_train=True,
            free_vars_func=None, num_iters=1):
        # Build the symbolic foreach graph, bind it, run it, then re-run the
        # same computation imperatively under autograd and compare outputs
        # and input gradients.
        step_sym = lambda in_syms, state_syms : step(in_syms, state_syms, free_syms)
        res, states = mx.sym.contrib.foreach(step_sym, in_syms, state_syms)
        out = _as_list(res)
        num_outputs = len(out)
        for i in range(num_outputs):
            out[i] = out[i] * 2
        out.extend(states)
        out = mx.sym.Group(out)
        # Serialization round-trip must be stable.
        js_1 = out.tojson()
        out = mx.sym.load_json(js_1)
        js_2 = out.tojson()
        assert js_1 == js_2

        # Bind arrays to variables 'v0', 'v1', ... in the order:
        # data inputs, then initial states, then free variables.
        arr_grads = []
        arg_dict = {}
        arg_grad_dict = {}
        i = 0
        for arr in _as_list(in_arrs):
            arr_grad = mx.nd.empty(arr.shape)
            arr_grads.append(arr_grad)
            arg_dict['v'+str(i)] = arr
            arg_grad_dict['v'+str(i)] = arr_grad
            i = i + 1
        for arr in init_states:
            arr_grad = mx.nd.empty(arr.shape)
            arr_grads.append(arr_grad)
            arg_dict['v'+str(i)] = arr
            arg_grad_dict['v'+str(i)] = arr_grad
            i = i + 1
        for arr in frees:
            arr_grad = mx.nd.empty(arr.shape)
            arr_grads.append(arr_grad)
            arg_dict['v'+str(i)] = arr
            arg_grad_dict['v'+str(i)] = arr_grad
            i = i + 1
        if is_train:
            e = out.bind(ctx=default_context(), args=arg_dict, args_grad=arg_grad_dict)
        else:
            e = out.bind(ctx=default_context(), args=arg_dict)
        # the inputs to forward and backward are the same so forward and backward
        # should always return the same outputs.
        for i in range(num_iters):
            e.forward(is_train=is_train)
            if (is_train):
                # backward
                tmp_grads = out_grads[0][:]
                tmp_grads.extend(out_grads[1])
                e.backward(tmp_grads)

        # Below we use imperative to reimplement foreach and compute its gradients.
        res = []
        for i in range(len(_as_list(out_grads[0]))):
            res.append([])
        for arr in _as_list(in_arrs):
            arr.attach_grad()
        for arr in init_states:
            arr.attach_grad()
        for arr in frees:
            arr.attach_grad()
        with mx.autograd.record():
            frees_imp = frees if free_vars_func is None else free_vars_func(frees)
            step_imp = lambda in_arrs, state_arrs : step(in_arrs, state_arrs, frees_imp)
            states = [mx.nd.expand_dims(s, 0) for s in init_states]
            res, states = mx.nd.contrib.foreach(step_imp, in_arrs, init_states)
            res2 = _as_list(res)
            for i in range(len(res2)):
                res2[i] = res2[i] * 2
            # `outs` mirrors the symbolic executor's output list (outputs then
            # final states); `res2` gets expand_dims'ed states so everything
            # can be concatenated into one tensor for a single backward call.
            outs = []
            outs[:] = res2[:]
            if isinstance(states, list):
                outs.extend(states)
                states = [mx.nd.expand_dims(s, 0) for s in states]
                res2.extend(states)
            else:
                outs.append(states)
                states = mx.nd.expand_dims(states, 0)
                res2.append(states)
            if is_train:
                res = mx.nd.concat(*res2, dim=0)

        tmp_grads = out_grads[0][:]
        tmp_grads1 = [mx.nd.expand_dims(grad, 0) for grad in out_grads[1]]
        tmp_grads.extend(tmp_grads1)
        if is_train:
            res.backward(mx.nd.concat(*tmp_grads, dim=0))
        # Outputs of the symbolic executor must equal the imperative results.
        for i in range(len(outs)):
            assert e.outputs[i].shape == outs[i].shape
            assert_almost_equal(e.outputs[i].asnumpy(), outs[i].asnumpy(),
                    rtol=1e-3, atol=1e-3)
        if (is_train):
            # Compare gradients in the same binding order as arg_dict above.
            all_ins = _as_list(in_arrs)[:]
            all_ins.extend(init_states)
            all_ins.extend(frees)
            size = min(len(all_ins), len(e.grad_arrays))
            for i in range(size):
                assert_almost_equal(all_ins[i].grad.asnumpy(),
                        e.grad_arrays[i].asnumpy(),
                        rtol=1e-3, atol=1e-3)

    # Test cases:
    # * graph inputs are stored in different orders.
    #   This is to test if foreach finds the data arrays and weight arrays
    #   in the right location.
    # * the number of iterations: odd or even.
    # * multiple inputs and multiple outputs.
    # * inference.
    def step1(in1, states, free):
        out = in1 * 2 + states[0] + free[0]
        return (out, [out])
    frees1 = [mx.nd.arange(2), mx.nd.arange(2) + 1]
    arrs = mx.nd.arange(6).reshape(shape=(3, 2))
    states = [mx.nd.arange(2)]
    out_grads = [[mx.nd.random.uniform(-10, 10, arrs.shape)],
            [mx.nd.random.uniform(-10, 10, states[0].shape)]]
    verify_foreach(step1, v3, [v4], [v5 + v6], arrs, states, frees1, out_grads, True,
            lambda frees : [frees[0] + frees[1]])
    verify_foreach(step1, v3, [v4], [v5 + v6], arrs, states, frees1, out_grads, False,
            lambda frees : [frees[0] + frees[1]])
    verify_foreach(step1, v3, [v4], [v5 + v6], arrs, states, frees1, out_grads, True,
            lambda frees : [frees[0] + frees[1]], 5)
    verify_foreach(step1, v3, [v4], [v5 + v6], arrs, states, frees1, out_grads, False,
            lambda frees : [frees[0] + frees[1]], 5)

    # Test the even number of iterations.
    frees = [mx.nd.random.uniform(shape=(2))]
    arrs = mx.nd.random.uniform(shape=(2, 2))
    out_grads = [[mx.nd.random.uniform(-10, 10, arrs.shape)],
            [mx.nd.random.uniform(-10, 10, states[0].shape)]]
    verify_foreach(step1, v3, [v4], [v5], arrs, states, frees, out_grads)
    verify_foreach(step1, v3, [v4], [v5], arrs, states, frees, out_grads, False)
    # Test the odd number of iterations
    arrs = mx.nd.random.uniform(shape=(3, 2))
    out_grads = [[mx.nd.random.uniform(-10, 10, arrs.shape)],
            [mx.nd.random.uniform(-10, 10, states[0].shape)]]
    verify_foreach(step1, v3, [v4], [v5], arrs, states, frees, out_grads)
    verify_foreach(step1, v3, [v4], [v5], arrs, states, frees, out_grads, False)

    # Reorder the input and state in the subgraph inputs.
    def step2(in1, states, free):
        out = states[0] + in1 * 2 + free[0]
        return (out, [out])
    # Test the even number of iterations.
    arrs = mx.nd.random.uniform(shape=(2, 2))
    out_grads = [[mx.nd.random.uniform(-10, 10, arrs.shape)],
            [mx.nd.random.uniform(-10, 10, states[0].shape)]]
    verify_foreach(step2, v3, [v4], [v5], arrs, states, frees, out_grads)
    verify_foreach(step2, v3, [v4], [v5], arrs, states, frees, out_grads, False)
    # Test the odd number of iterations.
    arrs = mx.nd.random.uniform(shape=(3, 2))
    out_grads = [[mx.nd.random.uniform(-10, 10, arrs.shape)],
            [mx.nd.random.uniform(-10, 10, states[0].shape)]]
    verify_foreach(step2, v3, [v4], [v5], arrs, states, frees, out_grads)
    verify_foreach(step2, v3, [v4], [v5], arrs, states, frees, out_grads, False)

    # Test multiple inputs and outputs.
    def step3(in1, states, free):
        out = in1[0] + in1[1] * 2 + states[0] + states[1] * 2 + free[0]
        return ([out, out], [out * 2, out * 3])
    arrs = [mx.nd.random.uniform(shape=(3, 2)), mx.nd.random.uniform(shape=(3, 2))]
    states = [mx.nd.random.uniform(shape=(2)), mx.nd.random.uniform(shape=(2))]
    out_grads = [[mx.nd.random.uniform(-10, 10, arrs[0].shape),
        mx.nd.random.uniform(-10, 10, arrs[1].shape)],
        [mx.nd.random.uniform(-10, 10, states[0].shape),
            mx.nd.random.uniform(-10, 10, states[1].shape)]]
    verify_foreach(step3, [v3, v4], [v5, v6], [v7], arrs, states, frees, out_grads)
    verify_foreach(step3, [v3, v4], [v5, v6], [v7], arrs, states, frees, out_grads, False)

    # Test multiple inputs and outputs.
    # The order of subgraph inputs doesn't match the operator inputs
    def step4(in1, states, free):
        out = in1[1] * 2 + states[0] + free[0] + states[1] * 2 + in1[0]
        return ([out, out * 2], [out * 2, out * 3])
    arrs = [mx.nd.random.uniform(shape=(3, 2)), mx.nd.random.uniform(shape=(3, 2))]
    states = [mx.nd.random.uniform(shape=(2)), mx.nd.random.uniform(shape=(2))]
    out_grads = [[mx.nd.random.uniform(-10, 10, arrs[0].shape),
        mx.nd.random.uniform(-10, 10, arrs[1].shape)],
        [mx.nd.random.uniform(-10, 10, states[0].shape),
            mx.nd.random.uniform(-10, 10, states[1].shape)]]
    verify_foreach(step4, [v3, v4], [v5, v6], [v7], arrs, states, frees, out_grads)
    verify_foreach(step4, [v3, v4], [v5, v6], [v7], arrs, states, frees, out_grads, False)

    # Test multiple inputs and outputs.
    # The data inputs and states have different shapes.
    def step5(in1, states, free):
        if isinstance(in1[0], mx.nd.NDArray):
            out1 = mx.nd.broadcast_add(states[0] + free[1], in1[1] * 2)
            out2 = mx.nd.broadcast_add(in1[0], free[0] + states[1] * 2)
        else:
            out1 = mx.sym.broadcast_add(states[0] + free[1], in1[1] * 2)
            out2 = mx.sym.broadcast_add(in1[0], free[0] + states[1] * 2)
        return ([out1, out2 * 2], [states[0] * 2, states[1] * 3])
    frees = [mx.nd.random.uniform(shape=(2)), mx.nd.random.uniform(shape=(2, 2))]
    arrs = [mx.nd.random.uniform(shape=(3, 2, 2)), mx.nd.random.uniform(shape=(3, 2))]
    states = [mx.nd.random.uniform(shape=(2, 2)), mx.nd.random.uniform(shape=(2))]
    out_grads = [[mx.nd.random.uniform(-10, 10, arrs[0].shape),
        mx.nd.random.uniform(-10, 10, arrs[0].shape)],
        [mx.nd.random.uniform(-10, 10, states[0].shape),
            mx.nd.random.uniform(-10, 10, states[1].shape)]]
    verify_foreach(step5, [v3, v4], [v5, v6], [v7, v8], arrs, states, frees, out_grads, False)

    # Test multiple inputs and outputs.
    # The data inputs and states have different shapes and data types.
    def step6(in1, states, free):
        if isinstance(in1[0], mx.nd.NDArray):
            out1 = mx.nd.broadcast_add(states[0] + mx.nd.cast(free[1], 'float32'),
                    mx.nd.cast(in1[1], 'float32') * 2)
            out2 = mx.nd.broadcast_add(in1[0],
                    free[0] + mx.nd.cast(states[1], 'float32') * 2)
        else:
            out1 = mx.sym.broadcast_add(states[0] + mx.sym.cast(free[1], 'float32'),
                    mx.sym.cast(in1[1], 'float32') * 2)
            out2 = mx.sym.broadcast_add(in1[0],
                    free[0] + mx.sym.cast(states[1], 'float32') * 2)
        return ([out1, out2 * 2], [states[0] * 2, states[1] * 3])
    frees = [mx.nd.random.uniform(shape=(2)),
            mx.nd.cast(mx.nd.random.uniform(shape=(2, 2)), 'float64')]
    arrs = [mx.nd.random.uniform(shape=(3, 2, 2)),
            mx.nd.cast(mx.nd.random.uniform(shape=(3, 2)), dtype='float16')]
    states = [mx.nd.random.uniform(shape=(2, 2)),
            mx.nd.cast(mx.nd.random.uniform(shape=(2)), dtype='int32')]
    out_grads = [[mx.nd.random.uniform(-10, 10, arrs[0].shape),
        mx.nd.random.uniform(-10, 10, arrs[0].shape)],
        [mx.nd.random.uniform(-10, 10, states[0].shape),
            mx.nd.random.uniform(-10, 10, states[1].shape)]]
    verify_foreach(step6, [v3, v4], [v5, v6], [v7, v8], arrs, states, frees, out_grads, False)

    # Test multiple inputs and outputs.
    # some of the inputs are used twice.
    def step7(in1, states, free):
        out1 = states[0] + in1[0] + free[1] + in1[1] * 2 + free[0]
        out2 = in1[0] + free[0] + states[1] * 2 + in1[1]
        return ([out1, out2 * 2], [states[0] * 2, states[1] * 3])
    frees = [mx.nd.random.uniform(shape=(2)), mx.nd.random.uniform(shape=(2))]
    arrs = [mx.nd.random.uniform(shape=(3, 2)), mx.nd.random.uniform(shape=(3, 2))]
    states = [mx.nd.random.uniform(shape=(2)), mx.nd.random.uniform(shape=(2))]
    out_grads = [[mx.nd.random.uniform(-10, 10, arrs[0].shape),
        mx.nd.random.uniform(-10, 10, arrs[0].shape)],
        [mx.nd.random.uniform(-10, 10, states[0].shape),
            mx.nd.random.uniform(-10, 10, states[1].shape)]]
    verify_foreach(step7, [v3, v4], [v5, v6], [v7, v8], arrs, states, frees, out_grads, False)

    # Test the case that the output is the input.
    arrs = mx.nd.random.uniform(shape=(3, 2))
    states = [mx.nd.arange(2)]
    frees = [mx.nd.random.uniform(shape=(2))]
    out_grads = [[mx.nd.random.uniform(-10, 10, arrs.shape)],
            [mx.nd.random.uniform(-10, 10, states[0].shape)]]
    def step8(in1, states, free):
        return (in1, [states[0] * free[0]])
    verify_foreach(step8, v3, [v4], [v5], arrs, states, frees, out_grads)
    verify_foreach(step8, v3, [v4], [v5], arrs, states, frees, out_grads, False)
    def step9(in1, states, free):
        return (in1 * free[0], states)
    verify_foreach(step9, v3, [v4], [v5], arrs, states, frees, out_grads)
    verify_foreach(step9, v3, [v4], [v5], arrs, states, frees, out_grads, False)

    # Test the case that not all inputs are used.
    def step10(in1, states, free):
        return (in1, states)
    verify_foreach(step10, v3, [v4], [v5], arrs, states, frees, out_grads)
    verify_foreach(step10, v3, [v4], [v5], arrs, states, frees, out_grads, False)
    def step11(in1, states, free):
        return (in1, free)
    try:
        verify_foreach(step11, v3, [v4], [v5], arrs, states, frees, out_grads)
        verify_foreach(step11, v3, [v4], [v5], arrs, states, frees, out_grads, False)
    except AssertionError:
        print("the states have to be used")
    def step12(in1, states, free):
        return (in1, [states[0] + 1, states[0] + 2])
    states = [mx.nd.random.uniform(shape=(2)), mx.nd.random.uniform(shape=(2))]
    frees = []
    try:
        verify_foreach(step12, v3, [v4, v5], [], arrs, states, frees, out_grads)
        verify_foreach(step12, v3, [v4, v5], [], arrs, states, frees, out_grads, False)
    except AssertionError:
        print("the states have to be used")

    # test without free variables.
    def step13(in1, states, free):
        return (in1, states)
    states = [mx.nd.random.uniform(shape=(2))]
    verify_foreach(step13, v3, [v4], [], arrs, states, [], out_grads)
    verify_foreach(step13, v3, [v4], [], arrs, states, [], out_grads, False)

    # test when there isn't output data or output states.
    def step14(in1, states, free):
        return (in1 + free[0], [])
    frees = [mx.nd.random.uniform(shape=(2))]
    verify_foreach(step14, v3, [], [v4], arrs, [], frees, out_grads)
    verify_foreach(step14, v3, [], [v4], arrs, [], frees, out_grads, False)
    def step15(in1, states, free):
        return ([], [in1 * states[0] * free[0]])
    out_grads = [[], [mx.nd.random.uniform(-10, 10, states[0].shape)]]
    verify_foreach(step15, v3, [v4], [v5], arrs, states, frees, out_grads)
    verify_foreach(step15, v3, [v4], [v5], arrs, states, frees, out_grads, False)

    # Test the case of iterating on a 1D data array.
    def step16(in1, states, free):
        return ([in1[0] * states[0]], [states[0] * 2])
    arrs = [mx.nd.arange(3)]
    states = [mx.nd.random.uniform(shape=(1))]
    out_grads = [[mx.nd.random.uniform(-10, 10, (3, 1))],
            [mx.nd.random.uniform(-10, 10, (1))]]
    verify_foreach(step16, [v3], [v4], [], arrs, states, [], out_grads)
    verify_foreach(step16, [v3], [v4], [], arrs, states, [], out_grads, False)
    def step17(in1, states, free):
        return ([in1[1] * in1[0] * states[0]], [states[0] * 2])
    arrs = [mx.nd.random.uniform(shape=(3, 1)), mx.nd.arange(3)]
    states = [mx.nd.random.uniform(shape=(1))]
    out_grads = [[mx.nd.random.uniform(-10, 10, (3, 1))],
            [mx.nd.random.uniform(-10, 10, (1))]]
    verify_foreach(step17, [v3, v4], [v5], [], arrs, states, [], out_grads)
    verify_foreach(step17, [v3, v4], [v5], [], arrs, states, [], out_grads, False)


@with_seed()
def test_foreach_nested():
    """Test nested foreach: an inner foreach inside each step of the outer one,
    comparing symbolic and imperative outputs and gradients."""
    # Test nested foreach.
    def step_in(in1, states):
        out = in1 * 2 + states[0]
        return (out, [out])

    def step_sym(in1, states):
        out1 = mx.sym.contrib.foreach(step_in, in1, states)
        out = mx.sym.broadcast_add(out1[0], states[0])
        return (out, [mx.sym.squeeze(mx.sym.slice(out, begin=(0, 0), end=(1, 2)))])
    def step_nd(in1, states):
        out1 = mx.nd.contrib.foreach(step_in, in1, states)
        out = mx.nd.broadcast_add(out1[0], states[0])
        return (out, [mx.nd.squeeze(mx.nd.slice(out, begin=(0, 0), end=(1, 2)))])

    data_sym = mx.sym.var("v1")
    state_sym = mx.sym.var("v2")
    out, states = mx.sym.contrib.foreach(step_sym, data_sym, [state_sym])
    assert isinstance(states, list)
    assert len(states) == 1
    out = mx.sym.broadcast_add(out, states[0])
    # Serialization round-trip must be stable.
    js_1 = out.tojson()
    out = mx.sym.load_json(js_1)
    js_2 = out.tojson()
    assert js_1 == js_2

    data = mx.nd.arange(8).reshape((2, 2, 2))
    state = mx.nd.arange(2)
    data_grad = mx.nd.empty(data.shape)
    state_grad = mx.nd.empty(state.shape)
    e = out.bind(ctx=default_context(), args={'v1':data, 'v2':state},
            args_grad={'v1':data_grad, 'v2':state_grad})
    e.forward(is_train=True)
    out_grads = []
    for out in e.outputs:
        out_grads.append(mx.nd.random.uniform(shape=out.shape))
    e.backward(out_grads)

    # Imperative reference run with the same head gradients.
    data.attach_grad()
    state.attach_grad()
    with mx.autograd.record():
        out, states = mx.nd.contrib.foreach(step_nd, data, [state])
        assert isinstance(states, list)
        assert len(states) == 1
        res = mx.nd.broadcast_add(out, states[0])
    assert_almost_equal(res.asnumpy(), e.outputs[0].asnumpy(), rtol=1e-3, atol=1e-3)

    res.backward(out_grads[0])
    assert_almost_equal(data.grad.asnumpy(), data_grad.asnumpy(), rtol=1e-3, atol=1e-3)
    assert_almost_equal(state.grad.asnumpy(), state_grad.asnumpy(), rtol=1e-3, atol=1e-3)


def check_foreach_rnn(cell_type, num_states):
    """Compare a foreach-based RNN against a manually unrolled one: identical
    outputs and input gradients over several forward/backward rounds."""
    data = mx.sym.var("data")
    params = mx.rnn.RNNParams()
    hidden_dim = 4
    input_dim = 5
    seq_len = 2
    batch_size = 2

    # This tests foreach with accumulation sum.
    def step(in1, states):
        rnn = cell_type(hidden_dim, prefix='', params=params)
        next_h, states = rnn(in1, states)
        return (next_h, states)

    def sym_group(out):
        # Flatten (outputs, states) from foreach into one symbol Group.
        if (isinstance(out[0], mx.sym.Symbol)):
            ret = [out[0]]
        else:
            ret = out[0]
        ret.extend(out[1])
        return mx.sym.Group(ret)

    rnn = cell_type(hidden_dim, prefix='', params=params)
    if num_states == 2:
        init_states = [mx.sym.var("h"), mx.sym.var("c")]
    else:
        init_states = [mx.sym.var("h")]
    out = mx.sym.contrib.foreach(step, data, init_states)
    out = sym_group(out)
    arg_shapes, out_shapes, aux_shapes = out.infer_shape(
            data=(seq_len, batch_size, input_dim), h=(batch_size, hidden_dim))
    rnn_inputs = out.list_inputs()

    # Inputs
    args1 = {name:mx.nd.random.uniform(shape=arg_shapes[i]) for i, name in enumerate(rnn_inputs)}
    args2 = copy.deepcopy(args1)
    # gradients for the backward of the foreach symbol
    args_grad1 = {name:mx.nd.empty(shape=arg_shapes[i]) for i, name in enumerate(rnn_inputs)}
    # gradients for the backward of the unrolled symbol.
    args_grad2 = {name:mx.nd.empty(shape=arg_shapes[i]) for i, name in enumerate(rnn_inputs)}

    # Symbol of running LSTM with foreach.
    out = mx.sym.contrib.foreach(step, data, init_states)
    out = sym_group(out)
    js_1 = out.tojson()
    out = mx.sym.load_json(js_1)
    js_2 = out.tojson()
    assert js_1 == js_2
    e1 = out.bind(ctx=default_context(), args=args1, args_grad=args_grad1)

    # Symbol of running unrolled LSTM.
    lstm = cell_type(hidden_dim, prefix='')
    unroll_outs = []
    states = init_states
    for inputs in mx.sym.split(data, num_outputs=seq_len, axis=0, squeeze_axis=True):
        h, states = lstm(inputs, states)
        unroll_outs.append(mx.sym.expand_dims(h, axis=0))
    unroll_outs = _as_list(mx.sym.concat(*unroll_outs, dim=0))
    unroll_outs.extend(states)
    out = mx.sym.Group(unroll_outs)
    js_1 = out.tojson()
    out = mx.sym.load_json(js_1)
    js_2 = out.tojson()
    assert js_1 == js_2
    e2 = out.bind(ctx=default_context(), args=args2, args_grad=args_grad2)

    for i in range(5):
        out_grads = []
        for arr in e1.outputs:
            out_grads.append(mx.nd.random.uniform(-10, 10, arr.shape))

        args = {name:mx.nd.random.uniform(shape=arg_shapes[i]) for i, name in enumerate(rnn_inputs)}

        e1.forward(is_train=True, **args)
        outputs1 = e1.outputs
        e1.backward(out_grads)

        e2.forward(is_train=True, **args)
        outputs2 = e2.outputs
        e2.backward(out_grads)

        for i in range(len(outputs2)):
            assert_almost_equal(outputs1[i].asnumpy(), outputs2[i].asnumpy(),
                    rtol=1e-3, atol=1e-3)
        input_names = out.list_inputs()
        for i in range(len(e1.grad_arrays)):
            name = input_names[i]
            assert_almost_equal(args_grad1[name].asnumpy(), args_grad2[name].asnumpy(),
                    rtol=1e-3, atol=1e-3)


@with_seed()
def test_foreach_rnn():
    """Run check_foreach_rnn over LSTM, RNN and GRU symbolic cells."""
    cell_types = [(mx.rnn.LSTMCell, 2), (mx.rnn.RNNCell, 1), (mx.rnn.GRUCell, 1)]
    for cell_type, num_states in cell_types:
        check_foreach_rnn(cell_type, num_states)


@with_seed()
def test_cut_subgraph_foreach():
    """Chained foreach calls where a later step closes over an earlier loop's
    states; hybridized and imperative results must agree (subgraph cutting)."""
    class TestLayer(gluon.HybridBlock):
        def __init__(self, prefix=None, params=None):
            super(TestLayer, self).__init__(prefix=prefix, params=params)

        def hybrid_forward(self, F, inputs, states):
            def step1(data, states):
                return data + 1, states
            out1, states1 = F.contrib.foreach(step1, inputs, states)
            out2, states2 = F.contrib.foreach(step1, out1, states)
            def step2(data, states):
                # states1 is a free variable captured from the first loop.
                return data + states[0], states1
            out, states = F.contrib.foreach(step2, out2, states)
            return out

    data = mx.nd.normal(loc=0, scale=1, shape=(5, 10))
    states = mx.nd.normal(loc=0, scale=1, shape=(10))
    layer = TestLayer()
    layer.initialize(ctx=default_context())
    res1 = layer(data, [states])
    with mx.autograd.record():
        res1 = layer(data, [states])

    layer = TestLayer()
    layer.initialize(ctx=default_context())
    layer.hybridize()
    res2 = layer(data, [states])
    with mx.autograd.record():
        res2 = layer(data, [states])
    assert_almost_equal(res1.asnumpy(), res2.asnumpy(), rtol=1e-3, atol=1e-3)


@with_seed()
def test_uniq_name():
    """Subgraphs reusing the same variable names must still produce correct
    results when hybridized (name uniquification in subgraph cutting)."""
    class ForeachLayer1(gluon.HybridBlock):
        def __init__(self, prefix=None, params=None):
            super(ForeachLayer1, self).__init__(prefix=prefix, params=params)

        def hybrid_forward(self, F, inputs, states):
            def step1(data, states):
                return data + 1, states
            out1, states1 = F.contrib.foreach(step1, inputs, states)
            # The input variables have the same symbol name.
            out, states = F.contrib.foreach(step1, out1, states1)
            return out

    class ForeachLayer2(gluon.HybridBlock):
        def __init__(self, prefix=None, params=None):
            super(ForeachLayer2, self).__init__(prefix=prefix, params=params)

        def hybrid_forward(self, F, inputs, states):
            def step1(data, states):
                return data + 1, states
            out1, states1 = F.contrib.foreach(step1, inputs, states)
            def step2(data, states):
                return data, [states[0] + states1[0] + F.squeeze(
                    out1.slice_axis(axis=0, begin=0, end=1))]
            # The input variables have the same symbol names.
            # The free variables have the same symbol names as the input variables.
            out, states = F.contrib.foreach(step2, out1, states1)
            return out

    class WhileLayer1(gluon.HybridBlock):
        def __init__(self, prefix=None, params=None):
            super(WhileLayer1, self).__init__(prefix=prefix, params=params)

        def hybrid_forward(self, F, inputs, states):
            def cond(state1, state2):
                s = F.squeeze(state1.slice_axis(axis=0, begin=0, end=1))
                return s == s
            def step(state1, state2):
                return state1 + 1, [state1, state2]
            states = [states[0], states[0] + 1]
            out1, states1 = F.contrib.while_loop(cond, step, states, max_iterations=5)
            # The input variables have the same symbol name.
            out, states = F.contrib.while_loop(cond, step, states1, max_iterations=5)
            return out

    class WhileLayer2(gluon.HybridBlock):
        def __init__(self, prefix=None, params=None):
            super(WhileLayer2, self).__init__(prefix=prefix, params=params)

        def hybrid_forward(self, F, inputs, states):
            def cond(state1, state2):
                s = F.squeeze(state1.slice_axis(axis=0, begin=0, end=1))
                return s == s
            def step1(state1, state2):
                return state1 + 1, [state1, state2]
            states = [states[0], states[0] + 1]
            out1, states1 = F.contrib.while_loop(cond, step1, states, max_iterations=5)
            def step2(state1, state2):
                return state1 + 1, [state1 + states1[0], state2 + states1[1]]
            # The input variables have the same symbol name.
            out, states = F.contrib.while_loop(cond, step2, states1, max_iterations=5)
            return out

    TestLayers = [ForeachLayer1, ForeachLayer2, WhileLayer1, WhileLayer2]

    data = mx.nd.normal(loc=0, scale=1, shape=(2, 5))
    states = mx.nd.normal(loc=0, scale=1, shape=(5))
    for TestLayer in TestLayers:
        layer = TestLayer()
        layer.initialize(ctx=default_context())
        res1 = layer(data, [states])
        with mx.autograd.record():
            res1 = layer(data, [states])

        layer = TestLayer()
        layer.initialize(ctx=default_context())
        layer.hybridize()
        res2 = layer(data, [states])
        with mx.autograd.record():
            res2 = layer(data, [states])
        assert_almost_equal(res1.asnumpy(), res2.asnumpy(), rtol=0.001, atol=0.0001)


@with_seed()
def test_cut_subgraph_while_loop():
    """Chained while_loops where the second closes over the first's results;
    hybridized and imperative results must agree."""
    class TestLayer(gluon.HybridBlock):
        def __init__(self, prefix=None, params=None):
            super(TestLayer, self).__init__(prefix=prefix, params=params)
        def hybrid_forward(self, F, data):
            out1, data1 = F.contrib.while_loop(
                cond=lambda i: i <= 5,
                func=lambda i: (None, (i + 1, )),
                loop_vars=(data, ),
                max_iterations=10,
            )
            out2, data2 = F.contrib.while_loop(
                cond=lambda i: data1[0],
                func=lambda i: (None, (i + 1, )),
                loop_vars=data1[0],
                max_iterations=10,
            )
            return data2[0]
    data = mx.nd.normal(loc=0, scale=1, shape=(1, ))
    layer = TestLayer()
    layer.initialize(ctx=default_context())
    res1 = layer(data)
    with mx.autograd.record():
        res1 = layer(data)
    layer = TestLayer()
    layer.initialize(ctx=default_context())
    layer.hybridize()
    res2 = layer(data)
    with mx.autograd.record():
        res2 = layer(data)
    assert_almost_equal(res1.asnumpy(), res2.asnumpy(), rtol=1e-3, atol=1e-3)


@with_seed()
def test_cut_subgraph_cond():
    """Chained cond calls where the second's predicate depends on the first's
    output; hybridized and imperative results must agree."""
    class TestLayer(gluon.HybridBlock):
        def __init__(self, prefix=None, params=None):
            super(TestLayer, self).__init__(prefix=prefix, params=params)
        def hybrid_forward(self, F, data):
            data1 = F.contrib.cond(
                data > 0.5,
                then_func=lambda: data * 2,
                else_func=lambda: data * 3,
            )
            data2 = F.contrib.cond(
                data1 > 0.5,
                then_func=lambda: data1 * 2,
                else_func=lambda: data1 * 3,
            )
            return data2
    data = mx.nd.normal(loc=0, scale=1, shape=(1, ))
    layer = TestLayer()
    layer.initialize(ctx=default_context())
    res1 = layer(data)
    with mx.autograd.record():
        res1 = layer(data)
    layer = TestLayer()
    layer.initialize(ctx=default_context())
    layer.hybridize()
    res2 = layer(data)
    with mx.autograd.record():
        res2 = layer(data)
    assert_almost_equal(res1.asnumpy(), res2.asnumpy(), rtol=1e-3, atol=1e-3)


def test_scope():
    """Two blocks using the same explicit cond name "my_cond" must get
    distinct subgraph names; each subgraph name counter ends at 2."""
    class TestBlock1(gluon.HybridBlock):
        def __init__(self, prefix=None, params=None):
            super(TestBlock1, self).__init__(prefix=prefix, params=params)
        def hybrid_forward(self, F, data):
            (new_data, ) = F.contrib.cond(
                data > 0.5,
                then_func=lambda: data * 2,
                else_func=lambda: data * 3,
                name="my_cond",
            )
            return new_data

    class TestBlock2(gluon.HybridBlock):
        def __init__(self, prefix=None, params=None):
            super(TestBlock2, self).__init__(prefix=prefix, params=params)
        def hybrid_forward(self, F, data):
            (new_data, ) = F.contrib.cond(
                data > 0.5,
                then_func=lambda: data * 2,
                else_func=lambda: data * 3,
                name="my_cond",
            )
            return new_data

    # Reset the global subgraph-name registry so counts are deterministic.
    AttrScope._subgraph_names = defaultdict(int)
    data = mx.nd.normal(loc=0, scale=1, shape=(1, ))
    block1 = TestBlock1()
    block1.initialize(ctx=default_context())
    block1.hybridize()
    _ = block1(data)
    block2 = TestBlock2()
    block2.initialize(ctx=default_context())
    block2.hybridize()
    _ = block2(data)
    assert len(AttrScope._subgraph_names) == 3
    # cond creates pred/then/else subgraphs; each name was used twice.
    assert AttrScope._subgraph_names['my_cond_else'] == 2
    assert AttrScope._subgraph_names['my_cond_pred'] == 2
    assert AttrScope._subgraph_names['my_cond_then'] == 2


def test_output_format_foreach():
    """The (outputs, states) structure returned by foreach must mirror the
    structure returned by the raw step function, hybridized or not."""
    class TestLayer1(gluon.HybridBlock):
        def __init__(self, step, prefix=None, params=None):
            super(TestLayer1, self).__init__(prefix=prefix, params=params)
            self.step = step
        def hybrid_forward(self, F, ins, states):
            out, states = F.contrib.foreach(self.step, ins, states)
            return out, states

    def step1(data, state):
        return data, state
    def step2(data, state):
        return [data], state
    def step3(data, state):
        if isinstance(state, list):
            return [], [state[0] + data]
        else:
            return [], state + data
    def step4(data, state):
        if isinstance(state, list):
            return [data, state[0]], state
        else:
            return [data, state], state
    steps = [step1, step2, step3, step4]
    data = mx.nd.normal(loc=0, scale=1, shape=(10, 2))
    state = mx.nd.normal(loc=0, scale=1, shape=(2))
    for step in steps:
        # Case 1: states passed as a list.
        layer1 = TestLayer1(step)
        layer1.initialize(ctx=default_context())
        layer2 = TestLayer1(step)
        layer2.initialize(ctx=default_context())
        layer2.hybridize()
        out1, state1 = layer1(data, [state])
        out2, state2 = layer2(data, [state])
        step_out, step_state = step(data, [state])
        assert type(out1) == type(step_out)
        assert type(out2) == type(step_out)
        assert type(state1) == type(step_state)
        assert type(state2) == type(step_state)
        out1 = _as_list(out1)
        out2 = _as_list(out2)
        state1 = _as_list(state1)
        state2 = _as_list(state2)
        for i in range(len(out1)):
            assert_almost_equal(out1[i].asnumpy(), out2[i].asnumpy(),
                    rtol=0.001, atol=0.0001)
        for i in range(len(state1)):
            assert_almost_equal(state1[i].asnumpy(), state2[i].asnumpy(),
                    rtol=0.001, atol=0.0001)

        # Case 2: a bare NDArray state.
        layer1 = TestLayer1(step)
        layer1.initialize(ctx=default_context())
        layer2 = TestLayer1(step)
        layer2.initialize(ctx=default_context())
        layer2.hybridize()
        out1, state1 = layer1(data, state)
        out2, state2 = layer2(data, state)
        step_out, step_state = step(data, state)
        assert type(out1) == type(step_out)
        assert type(out2) == type(step_out)
        assert type(state1) == type(step_state)
        assert type(state2) == type(step_state)
        out1 = _as_list(out1)
        out2 = _as_list(out2)
        state1 = _as_list(state1)
        state2 = _as_list(state2)
        for i in range(len(out1)):
            assert_almost_equal(out1[i].asnumpy(), out2[i].asnumpy(),
                    rtol=0.001, atol=0.0001)
        for i in range(len(state1)):
            assert_almost_equal(state1[i].asnumpy(), state2[i].asnumpy(),
                    rtol=0.001, atol=0.0001)

        # Case 3: a nested state list (step3 can't handle this shape).
        if step == step3:
            continue
        layer1 = TestLayer1(step)
        layer1.initialize(ctx=default_context())
        layer2 = TestLayer1(step)
        layer2.initialize(ctx=default_context())
        layer2.hybridize()
        out1, state1 = layer1(data, [state, [state + 1]])
        out2, state2 = layer2(data, [state, [state + 1]])
        step_out, step_state = step(data, [state, [state + 1]])
        assert type(out1) == type(step_out)
        assert type(out2) == type(step_out)
        assert type(state1) == type(step_state)
        assert type(state2) == type(step_state)
        out1 = _as_list(out1)
        out2 = _as_list(out2)
        state1 = _as_list(state1)
        state2 = _as_list(state2)
        for i in range(len(out1)):
            assert_almost_equal(out1[i].asnumpy(), out2[i].asnumpy(),
                    rtol=0.001, atol=0.0001)
        for i in range(len(state1)):
            if isinstance(state1[i], list):
                assert_almost_equal(state1[i][0].asnumpy(), state2[i][0].asnumpy(),
                        rtol=0.001, atol=0.0001)
            else:
                assert_almost_equal(state1[i].asnumpy(), state2[i].asnumpy(),
                        rtol=0.001, atol=0.0001)


def test_output_format_while():
    """The (outputs, states) structure returned by while_loop must match
    between hybridized and imperative execution for several state layouts."""
    class TestLayer1(gluon.HybridBlock):
        def __init__(self, step, use_list, nested_list=False, prefix=None, params=None):
            super(TestLayer1, self).__init__(prefix=prefix, params=params)
            self.step = step
            self.use_list = use_list
            self.nested_list = nested_list
        def hybrid_forward(self, F, states):
            def cond(state1):
                scalar = state1.slice_axis(axis=0, begin=0, end=1)
                # Always true: run until max_iterations.
                return scalar == scalar
            cond_func = cond
            if self.use_list:
                states = [states]
            elif self.nested_list:
                def cond2(state1, state2):
                    scalar = state1.slice_axis(axis=0, begin=0, end=1)
                    return scalar == scalar
                cond_func = cond2
                states = [states, [states + 1]]
            out, states = F.contrib.while_loop(cond_func, self.step, states,
                    max_iterations=5)
            return out, states

    def step1(state):
        return state, state
    def step2(state):
        if isinstance(state, list):
            return state, state
        else:
            return [state], state
    def step3(state):
        return [], state
    steps = [step1, step2, step3]
    state = mx.nd.normal(loc=0, scale=1, shape=(2))
    for step in steps:
        layer1 = TestLayer1(step, False)
        layer1.initialize(ctx=default_context())
        layer2 = TestLayer1(step, False)
        layer2.initialize(ctx=default_context())
        layer2.hybridize()
        out1, state1 = layer1(state)
        out2, state2 = layer2(state)
        assert type(out1) == type(out2)
        # BUG FIX: was `type(state1) == type(state1)`, a self-comparison that
        # is always true; compare against the hybridized result instead,
        # matching the use_list=True loop below.
        assert type(state1) == type(state2)
        out1 = _as_list(out1)
        out2 = _as_list(out2)
        state1 = _as_list(state1)
        state2 = _as_list(state2)
        for i in range(len(out1)):
            assert_almost_equal(out1[i].asnumpy(), out2[i].asnumpy(),
                    rtol=0.001, atol=0.0001)
        for i in range(len(state1)):
            assert_almost_equal(state1[i].asnumpy(), state2[i].asnumpy(),
                    rtol=0.001, atol=0.0001)

        layer1 = TestLayer1(step, True)
        layer1.initialize(ctx=default_context())
        layer2 = TestLayer1(step, True)
        layer2.initialize(ctx=default_context())
        layer2.hybridize()
        out1, state1 = layer1(state)
        out2, state2 = layer2(state)
        assert type(out1) == type(out2)
        assert type(state1) == type(state2)
        out1 = _as_list(out1)
        out2 = _as_list(out2)
        state1 = _as_list(state1)
        state2 = _as_list(state2)
        for i in range(len(out1)):
            assert_almost_equal(out1[i].asnumpy(), out2[i].asnumpy(),
                    rtol=0.001, atol=0.0001)
        for i in range(len(state1)):
            assert_almost_equal(state1[i].asnumpy(), state2[i].asnumpy(),
                    rtol=0.001, atol=0.0001)

    def step4(state, state2):
        states = _as_list(state)
        states.append(state2)
        return state, states
    def step5(state, state2):
        states = _as_list(state)
        states.append(state2)
        if isinstance(state, list):
            return state, states
        else:
            return [state], states
    def step6(state, state2):
        states = _as_list(state)
        states.append(state2)
        return [], states
    steps = [step4, step5, step6]
    for step in steps:
        layer1 = TestLayer1(step, False, True)
        layer1.initialize(ctx=default_context())
        layer2 = TestLayer1(step, False, True)
        layer2.initialize(ctx=default_context())
        layer2.hybridize()
        out1, state1 = layer1(state)
        out2, state2 = layer2(state)
        assert type(out1) == type(out2)
        assert type(state1) == type(state2)
        out1 = _as_list(out1)
        out2 = _as_list(out2)
        state1 = _as_list(state1)
        state2 = _as_list(state2)
        for i in range(len(out1)):
            assert_almost_equal(out1[i].asnumpy(), out2[i].asnumpy(),
                    rtol=0.001, atol=0.0001)
        for i in range(len(state1)):
            # Nested-list states are only compared at the top level here.
            if not isinstance(state1[i], list):
                assert_almost_equal(state1[i].asnumpy(), state2[i].asnumpy(),
                        rtol=0.001, atol=0.0001)


def test_output_format_cond():
    """The output structure of cond must mirror the branch function's return
    structure, hybridized or not."""
    class TestLayer1(gluon.HybridBlock):
        def __init__(self, func, prefix=None, params=None):
            super(TestLayer1, self).__init__(prefix=prefix, params=params)
            self.func = func
        def hybrid_forward(self, F, data):
            def then_func():
                return self.func(data)
            def else_func():
                return self.func(data)
            return F.contrib.cond(data.slice_axis(axis=0, begin=0, end=1),
                    then_func, else_func)

    def func1(data):
        return data
    def func2(data):
        return [data]
    def func3(data):
        return [data, data]
    funcs = [func1, func2, func3]
    data = mx.nd.normal(loc=0, scale=1, shape=(2))
    for func in funcs:
        layer1 = TestLayer1(func)
        layer1.initialize(ctx=default_context())
        layer2 = TestLayer1(func)
        layer2.initialize(ctx=default_context())
        layer2.hybridize()
        out1 = layer1(data)
        out2 = layer2(data)
        func_out = func(data)
        assert type(out1) == type(func_out)
        assert type(out2) == type(func_out)
        out1 = _as_list(out1)
        out2 = _as_list(out2)
        for i in range(len(out1)):
            assert_almost_equal(out1[i].asnumpy(), out2[i].asnumpy(),
                    rtol=0.001, atol=0.0001)


def test_foreach_with_unkown_dim():
    """foreach must propagate an unknown (0) batch dimension through partial
    shape inference."""
    # MXNet supports using 0 as placeholder for unknown dimensions in shape
    step = lambda data, states: (data + states[0], [states[0] * 2])
    # input shape with NCHW format and N is unknown
    data = mx.sym.var('data', shape=(0, 3, 32, 32))
    states = [mx.sym.var('state')]
    outs, states = mx.sym.contrib.foreach(step, data, states)
    _, output_shape, _ = outs.infer_shape_partial()
    assert_allclose((0, 3, 32, 32), output_shape[0])


if __name__ == '__main__':
    import nose
    nose.runmodule()