import numpy as np
import pytest
import xgboost as xgb
from xgboost import testing as tm
from xgboost.testing.parse_tree import (
    run_split_value_histograms,
    run_tree_to_df_categorical,
)

# Module-wide skip marker: the skipif arguments come from ``tm.no_pandas()``
# (presumably a condition/reason pair that is true when pandas is missing --
# confirm against xgboost.testing).
pytestmark = pytest.mark.skipif(**tm.no_pandas())


# NOTE(review): ``dpath`` is not referenced in the visible part of this file;
# kept in case other code relies on it.
dpath = "demo/data/"
# Shared, fixed-seed random generator so synthetic test data is reproducible.
rng = np.random.RandomState(1994)


class TestTreesToDataFrame:
    def build_model(self, max_depth, num_round):
        dtrain, _ = tm.load_agaricus(__file__)
        param = {"max_depth": max_depth, "objective": "binary:logistic", "verbosity": 1}
        num_round = num_round
        bst = xgb.train(param, dtrain, num_round)
        return bst

    def parse_dumped_model(self, booster, item_to_get, splitter):
        item_to_get += "="
        txt_dump = booster.get_dump(with_stats=True)
        tree_list = [tree.split("/n") for tree in txt_dump]
        split_trees = [tree[0].split(item_to_get)[1:] for tree in tree_list]
        res = sum(
            [float(line.split(splitter)[0]) for tree in split_trees for line in tree]
        )
        return res

    def test_trees_to_dataframe(self):
        bst = self.build_model(max_depth=5, num_round=10)
        gain_from_dump = self.parse_dumped_model(
            booster=bst, item_to_get="gain", splitter=","
        )
        cover_from_dump = self.parse_dumped_model(
            booster=bst, item_to_get="cover", splitter="\n"
        )
        # method being tested
        df = bst.trees_to_dataframe()

        # test for equality of gains
        gain_from_df = df[df.Feature != "Leaf"][["Gain"]].sum()
        assert np.allclose(gain_from_dump, gain_from_df)

        # test for equality of covers
        cover_from_df = df.Cover.sum()
        assert np.allclose(cover_from_dump, cover_from_df)

    def test_tree_to_df_categorical(self) -> None:
        run_tree_to_df_categorical("approx", "cpu")

    def test_tree_to_df_indicator(self, tmp_path) -> None:
        """Test trees_to_dataframe with indicator (boolean) features."""
        n_samples = 200
        n_features = 5
        X_int = rng.randint(0, 2, size=(n_samples, n_features))
        y = np.logical_xor(X_int[:, 0], X_int[:, 1]).astype(np.float32)
        X = X_int.astype(np.float32)
        dtrain = xgb.DMatrix(X, label=y)

        # Create a feature map with indicator type 'i'
        fmap_path = str(tmp_path / "fmap.txt")
        with open(fmap_path, "w", encoding="utf-8") as f:
            for i in range(n_features):
                f.write(f"{i}\tf{i}\ti\n")

        bst = xgb.train(
            {"max_depth": 3, "objective": "binary:logistic", "verbosity": 0},
            dtrain,
            num_boost_round=5,
        )
        df = bst.trees_to_dataframe(fmap=fmap_path)

        # Basic structure checks
        assert "Tree" in df.columns
        assert "Feature" in df.columns
        assert "Gain" in df.columns
        assert "Cover" in df.columns
        assert len(df) > 0

        # Indicator nodes should have NaN splits; missing defaults to no-direction
        non_leaf = df[df.Feature != "Leaf"]
        assert len(non_leaf) > 0
        assert non_leaf["Split"].isna().all()
        assert (non_leaf["Missing"] == non_leaf["No"]).all()

    def test_split_value_histograms(self):
        run_split_value_histograms("approx", "cpu")