{ "cells": [ { "cell_type": "markdown", "id": "52f0d9fa-02c5-4b3f-b999-87051ce902f0", "metadata": {}, "source": [ "### Data manipulation example\n", "\n", "We read in a SPLICE dataset and format it ready for inclusion in our NN model.\n", "\n", "Steps: \n", "\n", "1. Read in the data\n", "2. Create additional variables\n", "3. Apply cutoff\n", "4. Make sure there is one record for every development period\n", "5. Create variables and define which ones you want to use in the model\n", "6. Create train and test datasets" ] }, { "cell_type": "code", "execution_count": 3, "id": "d2c1269e-97d0-445e-b171-a380917a3278", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "from matplotlib import pyplot as plt\n", "import seaborn as sns\n", "\n", "import torch\n", "import torch.nn as nn\n", "from torch.nn import functional as F" ] }, { "cell_type": "code", "execution_count": 4, "id": "d484621e-77b7-47ee-a985-d786efdc3ce0", "metadata": {}, "outputs": [], "source": [ "pd.options.display.float_format = '{:,.2f}'.format" ] }, { "cell_type": "markdown", "id": "8cf5e2d1-b664-4bd3-8713-2ce490a3f904", "metadata": {}, "source": [ "### 1. Read in the data from SPLICE \n", "Here we just use the paid data. Case estimate data is also available in the incurred_1.csv file but we do not use it here" ] }, { "cell_type": "code", "execution_count": 5, "id": "872364f5-54a0-45a7-9f00-cbe1aa072889", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0claim_nopmt_nooccurrence_periodoccurrence_timeclaim_sizenotidelsetldelpayment_timepayment_periodpayment_sizepayment_inflatedpayment_delay
011110.73232,310.090.6623.215.33613,226.3413,226.343.93
121210.73232,310.090.6623.2110.091115,685.8615,685.864.76
231310.73232,310.090.6623.2118.021914,643.2814,643.287.93
341410.73232,310.090.6623.2122.8223170,041.89170,041.894.79
451510.73232,310.090.6623.2124.612518,712.7118,712.711.79
..........................................
1931719318366324039.8740,653.722.2910.6746.40472,213.882,213.881.84
1931819319366334039.8740,653.722.2910.6748.45492,292.752,292.752.06
1931919320366344039.8740,653.722.2910.6750.24512,367.322,367.321.78
1932019321366354039.8740,653.722.2910.6751.765228,576.0728,576.071.52
1932119322366364039.8740,653.722.2910.6752.84532,585.462,585.461.08
\n", "

19322 rows × 13 columns

\n", "
" ], "text/plain": [ " Unnamed: 0 claim_no pmt_no occurrence_period occurrence_time \\\n", "0 1 1 1 1 0.73 \n", "1 2 1 2 1 0.73 \n", "2 3 1 3 1 0.73 \n", "3 4 1 4 1 0.73 \n", "4 5 1 5 1 0.73 \n", "... ... ... ... ... ... \n", "19317 19318 3663 2 40 39.87 \n", "19318 19319 3663 3 40 39.87 \n", "19319 19320 3663 4 40 39.87 \n", "19320 19321 3663 5 40 39.87 \n", "19321 19322 3663 6 40 39.87 \n", "\n", " claim_size notidel setldel payment_time payment_period \\\n", "0 232,310.09 0.66 23.21 5.33 6 \n", "1 232,310.09 0.66 23.21 10.09 11 \n", "2 232,310.09 0.66 23.21 18.02 19 \n", "3 232,310.09 0.66 23.21 22.82 23 \n", "4 232,310.09 0.66 23.21 24.61 25 \n", "... ... ... ... ... ... \n", "19317 40,653.72 2.29 10.67 46.40 47 \n", "19318 40,653.72 2.29 10.67 48.45 49 \n", "19319 40,653.72 2.29 10.67 50.24 51 \n", "19320 40,653.72 2.29 10.67 51.76 52 \n", "19321 40,653.72 2.29 10.67 52.84 53 \n", "\n", " payment_size payment_inflated payment_delay \n", "0 13,226.34 13,226.34 3.93 \n", "1 15,685.86 15,685.86 4.76 \n", "2 14,643.28 14,643.28 7.93 \n", "3 170,041.89 170,041.89 4.79 \n", "4 18,712.71 18,712.71 1.79 \n", "... ... ... ... \n", "19317 2,213.88 2,213.88 1.84 \n", "19318 2,292.75 2,292.75 2.06 \n", "19319 2,367.32 2,367.32 1.78 \n", "19320 28,576.07 28,576.07 1.52 \n", "19321 2,585.46 2,585.46 1.08 \n", "\n", "[19322 rows x 13 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "transactions = pd.read_csv(\n", " f\"https://raw.githubusercontent.com/agi-lab/SPLICE/main/datasets/complexity_1/payment_1.csv\"\n", ")\n", "transactions" ] }, { "cell_type": "markdown", "id": "979ad8fb-8a00-4db4-b354-2f0890dd078c", "metadata": {}, "source": [ "### 2. Create additional variables" ] }, { "cell_type": "code", "execution_count": 6, "id": "08aff643-c549-4d87-9de3-8b88c55a1853", "metadata": {}, "outputs": [], "source": [ "transactions[\"noti_period\"] = np.ceil(transactions[\"occurrence_time\"] + transactions[\"notidel\"]).astype('int')\n", "transactions[\"settle_period\"] = np.ceil(transactions[\"occurrence_time\"] + transactions[\"notidel\"] + transactions[\"setldel\"]).astype('int')" ] }, { "cell_type": "markdown", "id": "8061ffce-c89c-40b9-ae8b-a046a715e438", "metadata": {}, "source": [ "### 3. Apply cutoff" ] }, { "cell_type": "code", "execution_count": 8, "id": "1142952c-aab1-4521-bd1c-91c74b575c9d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0claim_nopmt_nooccurrence_periodoccurrence_timeclaim_sizenotidelsetldelpayment_timepayment_periodpayment_sizepayment_inflatedpayment_delaynoti_periodsettle_perioddevelopment_period
011110.73232,310.090.6623.215.33613,226.3413,226.343.932255
121210.73232,310.090.6623.2110.091115,685.8615,685.864.7622510
231310.73232,310.090.6623.2118.021914,643.2814,643.287.9322518
341410.73232,310.090.6623.2122.8223170,041.89170,041.894.7922522
451510.73232,310.090.6623.2124.612518,712.7118,712.711.7922524
...................................................
1931719318366324039.8740,653.722.2910.6746.40472,213.882,213.881.8443537
1931819319366334039.8740,653.722.2910.6748.45492,292.752,292.752.0643539
1931919320366344039.8740,653.722.2910.6750.24512,367.322,367.321.78435311
1932019321366354039.8740,653.722.2910.6751.765228,576.0728,576.071.52435312
1932119322366364039.8740,653.722.2910.6752.84532,585.462,585.461.08435313
\n", "

19322 rows × 16 columns

\n", "
" ], "text/plain": [ " Unnamed: 0 claim_no pmt_no occurrence_period occurrence_time \\\n", "0 1 1 1 1 0.73 \n", "1 2 1 2 1 0.73 \n", "2 3 1 3 1 0.73 \n", "3 4 1 4 1 0.73 \n", "4 5 1 5 1 0.73 \n", "... ... ... ... ... ... \n", "19317 19318 3663 2 40 39.87 \n", "19318 19319 3663 3 40 39.87 \n", "19319 19320 3663 4 40 39.87 \n", "19320 19321 3663 5 40 39.87 \n", "19321 19322 3663 6 40 39.87 \n", "\n", " claim_size notidel setldel payment_time payment_period \\\n", "0 232,310.09 0.66 23.21 5.33 6 \n", "1 232,310.09 0.66 23.21 10.09 11 \n", "2 232,310.09 0.66 23.21 18.02 19 \n", "3 232,310.09 0.66 23.21 22.82 23 \n", "4 232,310.09 0.66 23.21 24.61 25 \n", "... ... ... ... ... ... \n", "19317 40,653.72 2.29 10.67 46.40 47 \n", "19318 40,653.72 2.29 10.67 48.45 49 \n", "19319 40,653.72 2.29 10.67 50.24 51 \n", "19320 40,653.72 2.29 10.67 51.76 52 \n", "19321 40,653.72 2.29 10.67 52.84 53 \n", "\n", " payment_size payment_inflated payment_delay noti_period \\\n", "0 13,226.34 13,226.34 3.93 2 \n", "1 15,685.86 15,685.86 4.76 2 \n", "2 14,643.28 14,643.28 7.93 2 \n", "3 170,041.89 170,041.89 4.79 2 \n", "4 18,712.71 18,712.71 1.79 2 \n", "... ... ... ... ... \n", "19317 2,213.88 2,213.88 1.84 43 \n", "19318 2,292.75 2,292.75 2.06 43 \n", "19319 2,367.32 2,367.32 1.78 43 \n", "19320 28,576.07 28,576.07 1.52 43 \n", "19321 2,585.46 2,585.46 1.08 43 \n", "\n", " settle_period development_period \n", "0 25 5 \n", "1 25 10 \n", "2 25 18 \n", "3 25 22 \n", "4 25 24 \n", "... ... ... \n", "19317 53 7 \n", "19318 53 9 \n", "19319 53 11 \n", "19320 53 12 \n", "19321 53 13 \n", "\n", "[19322 rows x 16 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Apply a maximum development period\n", "maxdev=40\n", "\n", "transactions[\"development_period\"] = np.minimum(transactions[\"payment_period\"] - transactions[\"occurrence_period\"], maxdev) \n", "\n", "transactions" ] }, { "cell_type": "markdown", "id": "ad6dba58-ed45-49f1-b003-853ddbbb84a9", "metadata": {}, "source": [ "### 4. Make sure there is one record for every development period (even if nothing happened in that period)\n", "* Create transactions_group for where if more than one payment was made in a development period, that will be grouped into a single record. Create pmt_no count\n", "* Create a range_payment_delay with just 40 development periods in it\n", "* Merge transactions and renge_payment_delay datasets to create clim_head_expeand_dev\n", " and create payment_period variable and is_settled flag \n", "* Merge transactions_group and claim_head_expand_dev to bring in payment size and pmt_no from transactions_group" ] }, { "cell_type": "code", "execution_count": 9, "id": "8e3bb4ef-c79e-48c4-9bf7-37e7b758078c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
claim_nodevelopment_periodpayment_sizepmt_no
01513,226.341
111015,685.862
211814,643.283
3122170,041.894
412418,712.715
...............
18140366372,213.882
18141366392,292.753
181423663112,367.324
1814336631228,576.075
181443663132,585.466
\n", "

18145 rows × 4 columns

\n", "
" ], "text/plain": [ " claim_no development_period payment_size pmt_no\n", "0 1 5 13,226.34 1\n", "1 1 10 15,685.86 2\n", "2 1 18 14,643.28 3\n", "3 1 22 170,041.89 4\n", "4 1 24 18,712.71 5\n", "... ... ... ... ...\n", "18140 3663 7 2,213.88 2\n", "18141 3663 9 2,292.75 3\n", "18142 3663 11 2,367.32 4\n", "18143 3663 12 28,576.07 5\n", "18144 3663 13 2,585.46 6\n", "\n", "[18145 rows x 4 columns]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Transactions summarised by claim/dev:\n", "transactions_group = (transactions\n", " .groupby([\"claim_no\", \"development_period\"], as_index=False)\n", " .agg({\"payment_size\": \"sum\", \"pmt_no\": \"max\"})\n", " .sort_values(by=[\"claim_no\", \"development_period\"])\n", ")\n", "transactions_group" ] }, { "cell_type": "code", "execution_count": 11, "id": "142ac2f5-90ee-4f55-8a4f-b8a046418d1b", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
development_period
00
11
22
33
44
55
66
77
88
99
1010
1111
1212
1313
1414
1515
1616
1717
1818
1919
2020
2121
2222
2323
2424
2525
2626
2727
2828
2929
3030
3131
3232
3333
3434
3535
3636
3737
3838
3939
\n", "
" ], "text/plain": [ " development_period\n", "0 0\n", "1 1\n", "2 2\n", "3 3\n", "4 4\n", "5 5\n", "6 6\n", "7 7\n", "8 8\n", "9 9\n", "10 10\n", "11 11\n", "12 12\n", "13 13\n", "14 14\n", "15 15\n", "16 16\n", "17 17\n", "18 18\n", "19 19\n", "20 20\n", "21 21\n", "22 22\n", "23 23\n", "24 24\n", "25 25\n", "26 26\n", "27 27\n", "28 28\n", "29 29\n", "30 30\n", "31 31\n", "32 32\n", "33 33\n", "34 34\n", "35 35\n", "36 36\n", "37 37\n", "38 38\n", "39 39" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "range_payment_delay = pd.DataFrame.from_dict({\"development_period\": range(0, maxdev)})\n", "range_payment_delay" ] }, { "cell_type": "code", "execution_count": 12, "id": "5dcfff2d-6d0d-4689-b6e5-95e7440f94a9", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
claim_noclaim_sizeoccurrence_periodoccurrence_timenoti_periodnotidelsetldelsettle_perioddevelopment_periodpayment_periodis_settled
01232,310.0910.7320.6623.212501False
11232,310.0910.7320.6623.212512False
21232,310.0910.7320.6623.212523False
31232,310.0910.7320.6623.212534False
41232,310.0910.7320.6623.212545False
....................................
146515366340,653.724039.87432.2910.67533575True
146516366340,653.724039.87432.2910.67533676True
146517366340,653.724039.87432.2910.67533777True
146518366340,653.724039.87432.2910.67533878True
146519366340,653.724039.87432.2910.67533979True
\n", "

146520 rows × 11 columns

\n", "
" ], "text/plain": [ " claim_no claim_size occurrence_period occurrence_time noti_period \\\n", "0 1 232,310.09 1 0.73 2 \n", "1 1 232,310.09 1 0.73 2 \n", "2 1 232,310.09 1 0.73 2 \n", "3 1 232,310.09 1 0.73 2 \n", "4 1 232,310.09 1 0.73 2 \n", "... ... ... ... ... ... \n", "146515 3663 40,653.72 40 39.87 43 \n", "146516 3663 40,653.72 40 39.87 43 \n", "146517 3663 40,653.72 40 39.87 43 \n", "146518 3663 40,653.72 40 39.87 43 \n", "146519 3663 40,653.72 40 39.87 43 \n", "\n", " notidel setldel settle_period development_period payment_period \\\n", "0 0.66 23.21 25 0 1 \n", "1 0.66 23.21 25 1 2 \n", "2 0.66 23.21 25 2 3 \n", "3 0.66 23.21 25 3 4 \n", "4 0.66 23.21 25 4 5 \n", "... ... ... ... ... ... \n", "146515 2.29 10.67 53 35 75 \n", "146516 2.29 10.67 53 36 76 \n", "146517 2.29 10.67 53 37 77 \n", "146518 2.29 10.67 53 38 78 \n", "146519 2.29 10.67 53 39 79 \n", "\n", " is_settled \n", "0 False \n", "1 False \n", "2 False \n", "3 False \n", "4 False \n", "... ... \n", "146515 True \n", "146516 True \n", "146517 True \n", "146518 True \n", "146519 True \n", "\n", "[146520 rows x 11 columns]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Claims header + development periods\n", "claim_head_expand_dev = (\n", " transactions\n", " .loc[:, [\"claim_no\", \"claim_size\", \"occurrence_period\", \"occurrence_time\", \"noti_period\", \"notidel\", \"setldel\", \"settle_period\"]]\n", " .drop_duplicates()\n", ").merge(\n", " range_payment_delay,\n", " how=\"cross\"\n", ").assign(\n", " payment_period=lambda df: (df.occurrence_period + df.development_period),\n", " is_settled=lambda df: (df.occurrence_period + df.development_period) >= df.settle_period\n", ")\n", "claim_head_expand_dev" ] }, { "cell_type": "code", "execution_count": 13, "id": "fa2e080f-589c-4111-a04b-aacc1968d70d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
claim_noclaim_sizeoccurrence_periodoccurrence_timenoti_periodnotidelsetldelsettle_perioddevelopment_periodpayment_periodis_settledpayment_sizepmt_no
01232,310.0910.7320.6623.212501False0.000.00
11232,310.0910.7320.6623.212512False0.000.00
21232,310.0910.7320.6623.212523False0.000.00
31232,310.0910.7320.6623.212534False0.000.00
41232,310.0910.7320.6623.212545False0.000.00
..........................................
146515366340,653.724039.87432.2910.67533575True0.000.00
146516366340,653.724039.87432.2910.67533676True0.000.00
146517366340,653.724039.87432.2910.67533777True0.000.00
146518366340,653.724039.87432.2910.67533878True0.000.00
146519366340,653.724039.87432.2910.67533979True0.000.00
\n", "

146520 rows × 13 columns

\n", "
" ], "text/plain": [ " claim_no claim_size occurrence_period occurrence_time noti_period \\\n", "0 1 232,310.09 1 0.73 2 \n", "1 1 232,310.09 1 0.73 2 \n", "2 1 232,310.09 1 0.73 2 \n", "3 1 232,310.09 1 0.73 2 \n", "4 1 232,310.09 1 0.73 2 \n", "... ... ... ... ... ... \n", "146515 3663 40,653.72 40 39.87 43 \n", "146516 3663 40,653.72 40 39.87 43 \n", "146517 3663 40,653.72 40 39.87 43 \n", "146518 3663 40,653.72 40 39.87 43 \n", "146519 3663 40,653.72 40 39.87 43 \n", "\n", " notidel setldel settle_period development_period payment_period \\\n", "0 0.66 23.21 25 0 1 \n", "1 0.66 23.21 25 1 2 \n", "2 0.66 23.21 25 2 3 \n", "3 0.66 23.21 25 3 4 \n", "4 0.66 23.21 25 4 5 \n", "... ... ... ... ... ... \n", "146515 2.29 10.67 53 35 75 \n", "146516 2.29 10.67 53 36 76 \n", "146517 2.29 10.67 53 37 77 \n", "146518 2.29 10.67 53 38 78 \n", "146519 2.29 10.67 53 39 79 \n", "\n", " is_settled payment_size pmt_no \n", "0 False 0.00 0.00 \n", "1 False 0.00 0.00 \n", "2 False 0.00 0.00 \n", "3 False 0.00 0.00 \n", "4 False 0.00 0.00 \n", "... ... ... ... \n", "146515 True 0.00 0.00 \n", "146516 True 0.00 0.00 \n", "146517 True 0.00 0.00 \n", "146518 True 0.00 0.00 \n", "146519 True 0.00 0.00 \n", "\n", "[146520 rows x 13 columns]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# create the dataset\n", "dat = claim_head_expand_dev.merge(\n", " transactions_group,\n", " how=\"left\",\n", " on=[\"claim_no\", \"development_period\"],\n", ").fillna(0)\n", "dat" ] }, { "cell_type": "markdown", "id": "bce50c41-0c7e-4128-91fe-345f298277ea", "metadata": {}, "source": [ "Only include records for a claim that is after the notification period" ] }, { "cell_type": "code", "execution_count": 14, "id": "ebdcec41-1b32-49b9-bdd7-a1e8796a3483", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
claim_noclaim_sizeoccurrence_periodoccurrence_timenoti_periodnotidelsetldelsettle_perioddevelopment_periodpayment_periodis_settledpayment_sizepmt_no
11232,310.0910.7320.6623.212512False0.000.00
21232,310.0910.7320.6623.212523False0.000.00
31232,310.0910.7320.6623.212534False0.000.00
41232,310.0910.7320.6623.212545False0.000.00
51232,310.0910.7320.6623.212556False13,226.341.00
..........................................
146515366340,653.724039.87432.2910.67533575True0.000.00
146516366340,653.724039.87432.2910.67533676True0.000.00
146517366340,653.724039.87432.2910.67533777True0.000.00
146518366340,653.724039.87432.2910.67533878True0.000.00
146519366340,653.724039.87432.2910.67533979True0.000.00
\n", "

142841 rows × 13 columns

\n", "
" ], "text/plain": [ " claim_no claim_size occurrence_period occurrence_time noti_period \\\n", "1 1 232,310.09 1 0.73 2 \n", "2 1 232,310.09 1 0.73 2 \n", "3 1 232,310.09 1 0.73 2 \n", "4 1 232,310.09 1 0.73 2 \n", "5 1 232,310.09 1 0.73 2 \n", "... ... ... ... ... ... \n", "146515 3663 40,653.72 40 39.87 43 \n", "146516 3663 40,653.72 40 39.87 43 \n", "146517 3663 40,653.72 40 39.87 43 \n", "146518 3663 40,653.72 40 39.87 43 \n", "146519 3663 40,653.72 40 39.87 43 \n", "\n", " notidel setldel settle_period development_period payment_period \\\n", "1 0.66 23.21 25 1 2 \n", "2 0.66 23.21 25 2 3 \n", "3 0.66 23.21 25 3 4 \n", "4 0.66 23.21 25 4 5 \n", "5 0.66 23.21 25 5 6 \n", "... ... ... ... ... ... \n", "146515 2.29 10.67 53 35 75 \n", "146516 2.29 10.67 53 36 76 \n", "146517 2.29 10.67 53 37 77 \n", "146518 2.29 10.67 53 38 78 \n", "146519 2.29 10.67 53 39 79 \n", "\n", " is_settled payment_size pmt_no \n", "1 False 0.00 0.00 \n", "2 False 0.00 0.00 \n", "3 False 0.00 0.00 \n", "4 False 0.00 0.00 \n", "5 False 13,226.34 1.00 \n", "... ... ... ... \n", "146515 True 0.00 0.00 \n", "146516 True 0.00 0.00 \n", "146517 True 0.00 0.00 \n", "146518 True 0.00 0.00 \n", "146519 True 0.00 0.00 \n", "\n", "[142841 rows x 13 columns]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dat = dat.loc[dat.payment_period >= dat.noti_period].copy()\n", "dat" ] }, { "cell_type": "markdown", "id": "bb12c91c-ac15-47b1-99a0-7d7266601b5d", "metadata": {}, "source": [ "### 5. Create variables and define which ones you want to use in the model" ] }, { "cell_type": "code", "execution_count": 15, "id": "6a5508a1-9daa-423b-937f-66260f368241", "metadata": {}, "outputs": [], "source": [ "# Clean close to zero values\n", "dat[\"payment_size\"] = np.where(abs(dat.payment_size) < 1e-2, 0.0, dat.payment_size)\n", "\n", "# Cumulative payments\n", "dat[\"payment_size_cumulative\"] = dat[[\"claim_no\", \"payment_size\"]].groupby('claim_no').cumsum()\n", "dat[\"log1_paid_cumulative\"] = np.log1p(dat.payment_size_cumulative)\n", "\n", "dat[\"payment_to_prior_period\"] = dat[\"payment_size_cumulative\"] - dat[\"payment_size\"]\n", "dat[\"has_payment_to_prior_period\"] = np.where(dat.payment_to_prior_period > 1e-2, 1, 0)\n", "\n", "dat[\"paid_dev_factor\"] = dat[\"payment_size_cumulative\"]/dat[\"payment_to_prior_period\"]*dat[\"has_payment_to_prior_period\"]\n", "\n", "dat[\"pmt_no\"] = dat.groupby(\"claim_no\")[\"pmt_no\"].cummax()\n", "\n", "dat[\"max_paid_dev_factor\"] = dat.groupby('claim_no')['paid_dev_factor'].cummax().fillna(0)\n", "dat[\"min_paid_dev_factor\"] = dat.groupby('claim_no')['paid_dev_factor'].cummin().fillna(0)" ] }, { "cell_type": "markdown", "id": "27582a55-af14-40ae-9fd6-e91f5414bfdd", "metadata": {}, "source": [ "Show records just for an individual claim - here claim_no 1" ] }, { "cell_type": "code", "execution_count": 17, "id": "e2d6d3de-357d-43ab-addb-802748ff2102", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
claim_noclaim_sizeoccurrence_periodoccurrence_timenoti_periodnotidelsetldelsettle_perioddevelopment_periodpayment_periodis_settledpayment_sizepmt_nopayment_size_cumulativelog1_paid_cumulativepayment_to_prior_periodhas_payment_to_prior_periodpaid_dev_factormax_paid_dev_factormin_paid_dev_factor
11232,310.0910.7320.6623.212512False0.000.000.000.000.000NaN0.000.00
21232,310.0910.7320.6623.212523False0.000.000.000.000.000NaN0.000.00
31232,310.0910.7320.6623.212534False0.000.000.000.000.000NaN0.000.00
41232,310.0910.7320.6623.212545False0.000.000.000.000.000NaN0.000.00
51232,310.0910.7320.6623.212556False13,226.341.0013,226.349.490.000NaN0.000.00
61232,310.0910.7320.6623.212567False0.001.0013,226.349.4913,226.3411.001.001.00
71232,310.0910.7320.6623.212578False0.001.0013,226.349.4913,226.3411.001.001.00
81232,310.0910.7320.6623.212589False0.001.0013,226.349.4913,226.3411.001.001.00
91232,310.0910.7320.6623.2125910False0.001.0013,226.349.4913,226.3411.001.001.00
101232,310.0910.7320.6623.21251011False15,685.862.0028,912.2010.2713,226.3412.192.191.00
111232,310.0910.7320.6623.21251112False0.002.0028,912.2010.2728,912.2011.002.191.00
121232,310.0910.7320.6623.21251213False0.002.0028,912.2010.2728,912.2011.002.191.00
131232,310.0910.7320.6623.21251314False0.002.0028,912.2010.2728,912.2011.002.191.00
141232,310.0910.7320.6623.21251415False0.002.0028,912.2010.2728,912.2011.002.191.00
151232,310.0910.7320.6623.21251516False0.002.0028,912.2010.2728,912.2011.002.191.00
161232,310.0910.7320.6623.21251617False0.002.0028,912.2010.2728,912.2011.002.191.00
171232,310.0910.7320.6623.21251718False0.002.0028,912.2010.2728,912.2011.002.191.00
181232,310.0910.7320.6623.21251819False14,643.283.0043,555.4810.6828,912.2011.512.191.00
191232,310.0910.7320.6623.21251920False0.003.0043,555.4810.6843,555.4811.002.191.00
201232,310.0910.7320.6623.21252021False0.003.0043,555.4810.6843,555.4811.002.191.00
211232,310.0910.7320.6623.21252122False0.003.0043,555.4810.6843,555.4811.002.191.00
221232,310.0910.7320.6623.21252223False170,041.894.00213,597.3812.2743,555.4814.904.901.00
231232,310.0910.7320.6623.21252324False0.004.00213,597.3812.27213,597.3811.004.901.00
241232,310.0910.7320.6623.21252425True18,712.715.00232,310.0912.36213,597.3811.094.901.00
251232,310.0910.7320.6623.21252526True0.005.00232,310.0912.36232,310.0911.004.901.00
261232,310.0910.7320.6623.21252627True0.005.00232,310.0912.36232,310.0911.004.901.00
271232,310.0910.7320.6623.21252728True0.005.00232,310.0912.36232,310.0911.004.901.00
281232,310.0910.7320.6623.21252829True0.005.00232,310.0912.36232,310.0911.004.901.00
291232,310.0910.7320.6623.21252930True0.005.00232,310.0912.36232,310.0911.004.901.00
301232,310.0910.7320.6623.21253031True0.005.00232,310.0912.36232,310.0911.004.901.00
311232,310.0910.7320.6623.21253132True0.005.00232,310.0912.36232,310.0911.004.901.00
321232,310.0910.7320.6623.21253233True0.005.00232,310.0912.36232,310.0911.004.901.00
331232,310.0910.7320.6623.21253334True0.005.00232,310.0912.36232,310.0911.004.901.00
341232,310.0910.7320.6623.21253435True0.005.00232,310.0912.36232,310.0911.004.901.00
351232,310.0910.7320.6623.21253536True0.005.00232,310.0912.36232,310.0911.004.901.00
361232,310.0910.7320.6623.21253637True0.005.00232,310.0912.36232,310.0911.004.901.00
371232,310.0910.7320.6623.21253738True0.005.00232,310.0912.36232,310.0911.004.901.00
381232,310.0910.7320.6623.21253839True0.005.00232,310.0912.36232,310.0911.004.901.00
391232,310.0910.7320.6623.21253940True0.005.00232,310.0912.36232,310.0911.004.901.00
\n", "
" ], "text/plain": [ " claim_no claim_size occurrence_period occurrence_time noti_period \\\n", "1 1 232,310.09 1 0.73 2 \n", "2 1 232,310.09 1 0.73 2 \n", "3 1 232,310.09 1 0.73 2 \n", "4 1 232,310.09 1 0.73 2 \n", "5 1 232,310.09 1 0.73 2 \n", "6 1 232,310.09 1 0.73 2 \n", "7 1 232,310.09 1 0.73 2 \n", "8 1 232,310.09 1 0.73 2 \n", "9 1 232,310.09 1 0.73 2 \n", "10 1 232,310.09 1 0.73 2 \n", "11 1 232,310.09 1 0.73 2 \n", "12 1 232,310.09 1 0.73 2 \n", "13 1 232,310.09 1 0.73 2 \n", "14 1 232,310.09 1 0.73 2 \n", "15 1 232,310.09 1 0.73 2 \n", "16 1 232,310.09 1 0.73 2 \n", "17 1 232,310.09 1 0.73 2 \n", "18 1 232,310.09 1 0.73 2 \n", "19 1 232,310.09 1 0.73 2 \n", "20 1 232,310.09 1 0.73 2 \n", "21 1 232,310.09 1 0.73 2 \n", "22 1 232,310.09 1 0.73 2 \n", "23 1 232,310.09 1 0.73 2 \n", "24 1 232,310.09 1 0.73 2 \n", "25 1 232,310.09 1 0.73 2 \n", "26 1 232,310.09 1 0.73 2 \n", "27 1 232,310.09 1 0.73 2 \n", "28 1 232,310.09 1 0.73 2 \n", "29 1 232,310.09 1 0.73 2 \n", "30 1 232,310.09 1 0.73 2 \n", "31 1 232,310.09 1 0.73 2 \n", "32 1 232,310.09 1 0.73 2 \n", "33 1 232,310.09 1 0.73 2 \n", "34 1 232,310.09 1 0.73 2 \n", "35 1 232,310.09 1 0.73 2 \n", "36 1 232,310.09 1 0.73 2 \n", "37 1 232,310.09 1 0.73 2 \n", "38 1 232,310.09 1 0.73 2 \n", "39 1 232,310.09 1 0.73 2 \n", "\n", " notidel setldel settle_period development_period payment_period \\\n", "1 0.66 23.21 25 1 2 \n", "2 0.66 23.21 25 2 3 \n", "3 0.66 23.21 25 3 4 \n", "4 0.66 23.21 25 4 5 \n", "5 0.66 23.21 25 5 6 \n", "6 0.66 23.21 25 6 7 \n", "7 0.66 23.21 25 7 8 \n", "8 0.66 23.21 25 8 9 \n", "9 0.66 23.21 25 9 10 \n", "10 0.66 23.21 25 10 11 \n", "11 0.66 23.21 25 11 12 \n", "12 0.66 23.21 25 12 13 \n", "13 0.66 23.21 25 13 14 \n", "14 0.66 23.21 25 14 15 \n", "15 0.66 23.21 25 15 16 \n", "16 0.66 23.21 25 16 17 \n", "17 0.66 23.21 25 17 18 \n", "18 0.66 23.21 25 18 19 \n", "19 0.66 23.21 25 19 20 \n", "20 0.66 23.21 25 20 21 \n", "21 0.66 23.21 25 21 22 \n", "22 0.66 23.21 25 22 23 \n", "23 0.66 23.21 25 23 24 \n", "24 0.66 23.21 25 24 25 \n", "25 0.66 23.21 25 25 26 \n", "26 0.66 23.21 25 26 27 \n", "27 0.66 23.21 25 27 28 \n", "28 0.66 23.21 25 28 29 \n", "29 0.66 23.21 25 29 30 \n", "30 0.66 23.21 25 30 31 \n", "31 0.66 23.21 25 31 32 \n", "32 0.66 23.21 25 32 33 \n", "33 0.66 23.21 25 33 34 \n", "34 0.66 23.21 25 34 35 \n", "35 0.66 23.21 25 35 36 \n", "36 0.66 23.21 25 36 37 \n", "37 0.66 23.21 25 37 38 \n", "38 0.66 23.21 25 38 39 \n", "39 0.66 23.21 25 39 40 \n", "\n", " is_settled payment_size pmt_no payment_size_cumulative \\\n", "1 False 0.00 0.00 0.00 \n", "2 False 0.00 0.00 0.00 \n", "3 False 0.00 0.00 0.00 \n", "4 False 0.00 0.00 0.00 \n", "5 False 13,226.34 1.00 13,226.34 \n", "6 False 0.00 1.00 13,226.34 \n", "7 False 0.00 1.00 13,226.34 \n", "8 False 0.00 1.00 13,226.34 \n", "9 False 0.00 1.00 13,226.34 \n", "10 False 15,685.86 2.00 28,912.20 \n", "11 False 0.00 2.00 28,912.20 \n", "12 False 0.00 2.00 28,912.20 \n", "13 False 0.00 2.00 28,912.20 \n", "14 False 0.00 2.00 28,912.20 \n", "15 False 0.00 2.00 28,912.20 \n", "16 False 0.00 2.00 28,912.20 \n", "17 False 0.00 2.00 28,912.20 \n", "18 False 14,643.28 3.00 43,555.48 \n", "19 False 0.00 3.00 43,555.48 \n", "20 False 0.00 3.00 43,555.48 \n", "21 False 0.00 3.00 43,555.48 \n", "22 False 170,041.89 4.00 213,597.38 \n", "23 False 0.00 4.00 213,597.38 \n", "24 True 18,712.71 5.00 232,310.09 \n", "25 True 0.00 5.00 232,310.09 \n", "26 True 0.00 5.00 232,310.09 \n", "27 True 0.00 5.00 232,310.09 \n", "28 True 0.00 5.00 232,310.09 \n", "29 True 0.00 5.00 232,310.09 \n", "30 True 0.00 5.00 232,310.09 \n", "31 True 0.00 5.00 232,310.09 \n", "32 True 0.00 5.00 232,310.09 \n", "33 True 0.00 5.00 232,310.09 \n", "34 True 0.00 5.00 232,310.09 \n", "35 True 0.00 5.00 232,310.09 \n", "36 True 0.00 5.00 232,310.09 \n", "37 True 0.00 5.00 232,310.09 \n", "38 True 0.00 5.00 232,310.09 \n", "39 True 0.00 5.00 232,310.09 \n", "\n", " log1_paid_cumulative payment_to_prior_period \\\n", "1 0.00 0.00 \n", "2 0.00 0.00 \n", "3 0.00 0.00 \n", "4 0.00 0.00 \n", "5 9.49 0.00 \n", "6 9.49 13,226.34 \n", "7 9.49 13,226.34 \n", "8 9.49 13,226.34 \n", "9 9.49 13,226.34 \n", "10 10.27 13,226.34 \n", "11 10.27 28,912.20 \n", "12 10.27 28,912.20 \n", "13 10.27 28,912.20 \n", "14 10.27 28,912.20 \n", "15 10.27 28,912.20 \n", "16 10.27 28,912.20 \n", "17 10.27 28,912.20 \n", "18 10.68 28,912.20 \n", "19 10.68 43,555.48 \n", "20 10.68 43,555.48 \n", "21 10.68 43,555.48 \n", "22 12.27 43,555.48 \n", "23 12.27 213,597.38 \n", "24 12.36 213,597.38 \n", "25 12.36 232,310.09 \n", "26 12.36 232,310.09 \n", "27 12.36 232,310.09 \n", "28 12.36 232,310.09 \n", "29 12.36 232,310.09 \n", "30 12.36 232,310.09 \n", "31 12.36 232,310.09 \n", "32 12.36 232,310.09 \n", "33 12.36 232,310.09 \n", "34 12.36 232,310.09 \n", "35 12.36 232,310.09 \n", "36 12.36 232,310.09 \n", "37 12.36 232,310.09 \n", "38 12.36 232,310.09 \n", "39 12.36 232,310.09 \n", "\n", " has_payment_to_prior_period paid_dev_factor max_paid_dev_factor \\\n", "1 0 NaN 0.00 \n", "2 0 NaN 0.00 \n", "3 0 NaN 0.00 \n", "4 0 NaN 0.00 \n", "5 0 NaN 0.00 \n", "6 1 1.00 1.00 \n", "7 1 1.00 1.00 \n", "8 1 1.00 1.00 \n", "9 1 1.00 1.00 \n", "10 1 2.19 2.19 \n", "11 1 1.00 2.19 \n", "12 1 1.00 2.19 \n", "13 1 1.00 2.19 \n", "14 1 1.00 2.19 \n", "15 1 1.00 2.19 \n", "16 1 1.00 2.19 \n", "17 1 1.00 2.19 \n", "18 1 1.51 2.19 \n", "19 1 1.00 2.19 \n", "20 1 1.00 2.19 \n", "21 1 1.00 2.19 \n", "22 1 4.90 4.90 \n", "23 1 1.00 4.90 \n", "24 1 1.09 4.90 \n", "25 1 1.00 4.90 \n", "26 1 1.00 4.90 \n", "27 1 1.00 4.90 \n", "28 1 1.00 4.90 \n", "29 1 1.00 4.90 \n", "30 1 1.00 4.90 \n", "31 1 1.00 4.90 \n", "32 1 1.00 4.90 \n", "33 1 1.00 4.90 \n", "34 1 1.00 4.90 \n", "35 1 1.00 4.90 \n", "36 1 1.00 4.90 \n", "37 1 1.00 4.90 \n", "38 1 1.00 4.90 \n", "39 1 1.00 4.90 \n", "\n", " min_paid_dev_factor \n", "1 0.00 \n", "2 0.00 \n", "3 0.00 \n", "4 0.00 \n", "5 0.00 \n", "6 1.00 \n", "7 1.00 \n", "8 1.00 \n", "9 1.00 \n", "10 1.00 \n", "11 1.00 \n", "12 1.00 \n", "13 1.00 \n", "14 1.00 \n", "15 1.00 \n", "16 1.00 \n", "17 1.00 \n", "18 1.00 \n", "19 1.00 \n", "20 1.00 \n", "21 1.00 \n", "22 1.00 \n", "23 1.00 \n", "24 1.00 \n", "25 1.00 \n", "26 1.00 \n", "27 1.00 \n", "28 1.00 \n", "29 1.00 \n", "30 1.00 \n", "31 1.00 \n", "32 1.00 \n", "33 1.00 \n", "34 1.00 \n", "35 1.00 \n", "36 1.00 \n", "37 1.00 \n", "38 1.00 \n", "39 1.00 " ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dat[dat['claim_no'] == 1]" ] }, { "cell_type": "markdown", "id": "88bd84b2-4d65-4ba6-aa01-5e082ccaeb67", "metadata": {}, "source": [ "Define which variables you want to use in the model\n", "\n", "output_field and youtput are also created to define the y variate for use in the model" ] }, { "cell_type": "code", "execution_count": 19, "id": "cb37ea5a-fadb-4a6a-85af-ae4e3d07f3b5", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
claim_nooccurrence_timenotideldevelopment_periodpmt_nolog1_paid_cumulativemax_paid_dev_factormin_paid_dev_factorclaim_size
110.730.6610.000.000.000.00232,310.09
210.730.6620.000.000.000.00232,310.09
310.730.6630.000.000.000.00232,310.09
410.730.6640.000.000.000.00232,310.09
510.730.6651.009.490.000.00232,310.09
..............................
146515366339.872.29356.0010.614.011.0040,653.72
146516366339.872.29366.0010.614.011.0040,653.72
146517366339.872.29376.0010.614.011.0040,653.72
146518366339.872.29386.0010.614.011.0040,653.72
146519366339.872.29396.0010.614.011.0040,653.72
\n", "

142841 rows × 9 columns

\n", "
" ], "text/plain": [ " claim_no occurrence_time notidel development_period pmt_no \\\n", "1 1 0.73 0.66 1 0.00 \n", "2 1 0.73 0.66 2 0.00 \n", "3 1 0.73 0.66 3 0.00 \n", "4 1 0.73 0.66 4 0.00 \n", "5 1 0.73 0.66 5 1.00 \n", "... ... ... ... ... ... \n", "146515 3663 39.87 2.29 35 6.00 \n", "146516 3663 39.87 2.29 36 6.00 \n", "146517 3663 39.87 2.29 37 6.00 \n", "146518 3663 39.87 2.29 38 6.00 \n", "146519 3663 39.87 2.29 39 6.00 \n", "\n", " log1_paid_cumulative max_paid_dev_factor min_paid_dev_factor \\\n", "1 0.00 0.00 0.00 \n", "2 0.00 0.00 0.00 \n", "3 0.00 0.00 0.00 \n", "4 0.00 0.00 0.00 \n", "5 9.49 0.00 0.00 \n", "... ... ... ... \n", "146515 10.61 4.01 1.00 \n", "146516 10.61 4.01 1.00 \n", "146517 10.61 4.01 1.00 \n", "146518 10.61 4.01 1.00 \n", "146519 10.61 4.01 1.00 \n", "\n", " claim_size \n", "1 232,310.09 \n", "2 232,310.09 \n", "3 232,310.09 \n", "4 232,310.09 \n", "5 232,310.09 \n", "... ... \n", "146515 40,653.72 \n", "146516 40,653.72 \n", "146517 40,653.72 \n", "146518 40,653.72 \n", "146519 40,653.72 \n", "\n", "[142841 rows x 9 columns]" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Potential features for model later:\n", "data_cols = [\n", " \"claim_no\",\n", " \"occurrence_time\", \n", " \"notidel\", \n", " \"development_period\", \n", " \"pmt_no\",\n", " \"log1_paid_cumulative\",\n", " \"max_paid_dev_factor\",\n", " \"min_paid_dev_factor\",\n", "]\n", "\n", "list_of_features = data_cols\n", "output_field = [\"claim_size\"]\n", "youtput=\"claim_size\"\n", "\n", "dat.loc[:, data_cols + [youtput]]\n", "#dat.loc[:, list_of_features + output_field]" ] }, { "cell_type": "markdown", "id": "077f8293-7706-4c51-b0c1-9326cb86e969", "metadata": {}, "source": [ "## 6. Create train and test datasets. \n", "We provide three types of indicators here for reference, but in our NN example we only use train_ind \n", "\n", "* train_ind creates a split by claim - 'rectangular data'\n", "* train_ind_time is what you would use if you wanted to split the datasets by calendar period\n", "* cv_ind is what you might use if you were doing cross validation" ] }, { "cell_type": "code", "execution_count": 20, "id": "0c4dc679-374d-492a-8ed1-ae90520af784", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
claim_noclaim_sizeoccurrence_periodoccurrence_timenoti_periodnotidelsetldelsettle_perioddevelopment_periodpayment_period...payment_size_cumulativelog1_paid_cumulativepayment_to_prior_periodhas_payment_to_prior_periodpaid_dev_factormax_paid_dev_factormin_paid_dev_factortrain_indtrain_ind_timecv_ind
11232,310.0910.7320.6623.212512...0.000.000.000NaN0.000.00FalseTrue2
21232,310.0910.7320.6623.212523...0.000.000.000NaN0.000.00FalseTrue3
31232,310.0910.7320.6623.212534...0.000.000.000NaN0.000.00FalseTrue4
41232,310.0910.7320.6623.212545...0.000.000.000NaN0.000.00FalseTrue0
51232,310.0910.7320.6623.212556...13,226.349.490.000NaN0.000.00FalseTrue1
..................................................................
146515366340,653.724039.87432.2910.67533575...40,653.7210.6140,653.7211.004.011.00FalseFalse0
146516366340,653.724039.87432.2910.67533676...40,653.7210.6140,653.7211.004.011.00FalseFalse1
146517366340,653.724039.87432.2910.67533777...40,653.7210.6140,653.7211.004.011.00FalseFalse2
146518366340,653.724039.87432.2910.67533878...40,653.7210.6140,653.7211.004.011.00FalseFalse3
146519366340,653.724039.87432.2910.67533979...40,653.7210.6140,653.7211.004.011.00FalseFalse4
\n", "

142841 rows × 23 columns

\n", "
" ], "text/plain": [ " claim_no claim_size occurrence_period occurrence_time noti_period \\\n", "1 1 232,310.09 1 0.73 2 \n", "2 1 232,310.09 1 0.73 2 \n", "3 1 232,310.09 1 0.73 2 \n", "4 1 232,310.09 1 0.73 2 \n", "5 1 232,310.09 1 0.73 2 \n", "... ... ... ... ... ... \n", "146515 3663 40,653.72 40 39.87 43 \n", "146516 3663 40,653.72 40 39.87 43 \n", "146517 3663 40,653.72 40 39.87 43 \n", "146518 3663 40,653.72 40 39.87 43 \n", "146519 3663 40,653.72 40 39.87 43 \n", "\n", " notidel setldel settle_period development_period payment_period \\\n", "1 0.66 23.21 25 1 2 \n", "2 0.66 23.21 25 2 3 \n", "3 0.66 23.21 25 3 4 \n", "4 0.66 23.21 25 4 5 \n", "5 0.66 23.21 25 5 6 \n", "... ... ... ... ... ... \n", "146515 2.29 10.67 53 35 75 \n", "146516 2.29 10.67 53 36 76 \n", "146517 2.29 10.67 53 37 77 \n", "146518 2.29 10.67 53 38 78 \n", "146519 2.29 10.67 53 39 79 \n", "\n", " ... payment_size_cumulative log1_paid_cumulative \\\n", "1 ... 0.00 0.00 \n", "2 ... 0.00 0.00 \n", "3 ... 0.00 0.00 \n", "4 ... 0.00 0.00 \n", "5 ... 13,226.34 9.49 \n", "... ... ... ... \n", "146515 ... 40,653.72 10.61 \n", "146516 ... 40,653.72 10.61 \n", "146517 ... 40,653.72 10.61 \n", "146518 ... 40,653.72 10.61 \n", "146519 ... 40,653.72 10.61 \n", "\n", " payment_to_prior_period has_payment_to_prior_period paid_dev_factor \\\n", "1 0.00 0 NaN \n", "2 0.00 0 NaN \n", "3 0.00 0 NaN \n", "4 0.00 0 NaN \n", "5 0.00 0 NaN \n", "... ... ... ... \n", "146515 40,653.72 1 1.00 \n", "146516 40,653.72 1 1.00 \n", "146517 40,653.72 1 1.00 \n", "146518 40,653.72 1 1.00 \n", "146519 40,653.72 1 1.00 \n", "\n", " max_paid_dev_factor min_paid_dev_factor train_ind train_ind_time \\\n", "1 0.00 0.00 False True \n", "2 0.00 0.00 False True \n", "3 0.00 0.00 False True \n", "4 0.00 0.00 False True \n", "5 0.00 0.00 False True \n", "... ... ... ... ... \n", "146515 4.01 1.00 False False \n", "146516 4.01 1.00 False False \n", "146517 4.01 1.00 False False \n", "146518 4.01 1.00 False False \n", "146519 4.01 1.00 False False \n", "\n", " cv_ind \n", "1 2 \n", "2 3 \n", "3 4 \n", "4 0 \n", "5 1 \n", "... ... \n", "146515 0 \n", "146516 1 \n", "146517 2 \n", "146518 3 \n", "146519 4 \n", "\n", "[142841 rows x 23 columns]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cutoff=maxdev\n", "\n", "dat[\"train_ind\"] = (dat.claim_no % 10 >= 4)\n", "dat[\"train_ind_time\"] = (dat.payment_period <= cutoff)\n", "dat[\"cv_ind\"] = dat.payment_period % 5\n", "dat" ] }, { "cell_type": "code", "execution_count": 21, "id": "3516e965-8337-4bcf-b5af-13f08a8c4902", "metadata": {}, "outputs": [], "source": [ "X_train = (dat.loc[(dat.train_ind == 1), list_of_features])\n", "y_train = (dat.loc[(dat.train_ind == 1), youtput])\n", "\n", "X_test = (dat.loc[(dat.train_ind == 0), list_of_features])\n", "y_test = (dat.loc[(dat.train_ind == 0), youtput])\n", "\n", "X = (dat.loc[:, list_of_features])\n", "y = (dat.loc[:, youtput])" ] }, { "cell_type": "markdown", "id": "c465b2f0-1f32-4cec-b9ed-87643674a439", "metadata": {}, "source": [ "Save file to disk. ***Remember to change the directory reference below to where you want it to be saved***. " ] }, { "cell_type": "code", "execution_count": 22, "id": "41cea055-ab71-45bc-bb3d-976c371bf2da", "metadata": {}, "outputs": [], "source": [ "dirname_in=\"/DataDirectoryPath/\"\n", "filename_in=\"data.csv\"\n", "\n", "dat.to_csv(\n", " dirname + filename\n", ")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.7" } }, "nbformat": 4, "nbformat_minor": 5 }