
TensorFlow for R - Imbalanced classification: credit card fraud detection

Introduction

This example looks at the Kaggle Credit Card Fraud Detection dataset to demonstrate how to train a classification model on data with highly imbalanced classes. You can download the data by clicking “Download” at the link, or if you’re set up with a Kaggle API key at "~/.kaggle/kaggle.json", you can run the following:

reticulate::py_install("kaggle", pip = TRUE)
system("kaggle datasets download -d mlg-ulb/creditcardfraud")
zip::unzip("creditcardfraud.zip", files = "creditcard.csv")
First, read in the CSV data
library(tensorflow)
library(keras)
set.seed(1234)
df <- readr::read_csv("creditcard.csv")
tibble::glimpse(df)
Rows: 284,807
Columns: 31
$ Time   <dbl> 0, 0, 1, 1, 2, 2, 4, 7, 7, 9, 10, 10, 10, 11, 12, 12, 12, 1…
$ V1     <dbl> -1.3598071, 1.1918571, -1.3583541, -0.9662717, -1.1582331, …
$ V2     <dbl> -0.07278117, 0.26615071, -1.34016307, -0.18522601, 0.877736…
$ V3     <dbl> 2.53634674, 0.16648011, 1.77320934, 1.79299334, 1.54871785,…
$ V4     <dbl> 1.37815522, 0.44815408, 0.37977959, -0.86329128, 0.40303393…
$ V5     <dbl> -0.33832077, 0.06001765, -0.50319813, -0.01030888, -0.40719…
$ V6     <dbl> 0.46238778, -0.08236081, 1.80049938, 1.24720317, 0.09592146…
$ V7     <dbl> 0.239598554, -0.078802983, 0.791460956, 0.237608940, 0.5929…
$ V8     <dbl> 0.098697901, 0.085101655, 0.247675787, 0.377435875, -0.2705…
$ V9     <dbl> 0.3637870, -0.2554251, -1.5146543, -1.3870241, 0.8177393, -…
$ V10    <dbl> 0.09079417, -0.16697441, 0.20764287, -0.05495192, 0.7530744…
$ V11    <dbl> -0.55159953, 1.61272666, 0.62450146, -0.22648726, -0.822842…
$ V12    <dbl> -0.61780086, 1.06523531, 0.06608369, 0.17822823, 0.53819555…
$ V13    <dbl> -0.99138985, 0.48909502, 0.71729273, 0.50775687, 1.34585159…
$ V14    <dbl> -0.31116935, -0.14377230, -0.16594592, -0.28792375, -1.1196…
$ V15    <dbl> 1.468176972, 0.635558093, 2.345864949, -0.631418118, 0.1751…
$ V16    <dbl> -0.47040053, 0.46391704, -2.89008319, -1.05964725, -0.45144…
$ V17    <dbl> 0.207971242, -0.114804663, 1.109969379, -0.684092786, -0.23…
$ V18    <dbl> 0.02579058, -0.18336127, -0.12135931, 1.96577500, -0.038194…
$ V19    <dbl> 0.40399296, -0.14578304, -2.26185710, -1.23262197, 0.803486…
$ V20    <dbl> 0.25141210, -0.06908314, 0.52497973, -0.20803778, 0.4085423…
$ V21    <dbl> -0.018306778, -0.225775248, 0.247998153, -0.108300452, -0.0…
$ V22    <dbl> 0.277837576, -0.638671953, 0.771679402, 0.005273597, 0.7982…
$ V23    <dbl> -0.110473910, 0.101288021, 0.909412262, -0.190320519, -0.13…
$ V24    <dbl> 0.06692807, -0.33984648, -0.68928096, -1.17557533, 0.141266…
$ V25    <dbl> 0.12853936, 0.16717040, -0.32764183, 0.64737603, -0.2060095…
$ V26    <dbl> -0.18911484, 0.12589453, -0.13909657, -0.22192884, 0.502292…
$ V27    <dbl> 0.133558377, -0.008983099, -0.055352794, 0.062722849, 0.219…
$ V28    <dbl> -0.021053053, 0.014724169, -0.059751841, 0.061457629, 0.215…
$ Amount <dbl> 149.62, 2.69, 378.66, 123.50, 69.99, 3.67, 4.99, 40.80, 93.…
$ Class  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
Prepare a validation set
val_idxs <- nrow(df) %>% sample.int(., ceiling(. * 0.2))
val_df <- df[val_idxs, ]
train_df <- df[-val_idxs, ]

sprintf("Number of training samples: %s", nrow(train_df))
[1] "Number of training samples: 227845"
sprintf("Number of validation samples: %s", nrow(val_df))
[1] "Number of validation samples: 56962"
Analyze class imbalance in the targets
train_df$Class %>% {
  cat(sprintf(
    "Number of positive samples in training data: %s (%.2f%% of total)\n",
    sum(.), 100 * mean(.)))
}
Number of positive samples in training data: 395 (0.17% of total)
weight_for_0 <- 1 / sum(train_df$Class == 0)
weight_for_1 <- 1 / sum(train_df$Class == 1)
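These weights are inversely proportional to each class's frequency, so both classes contribute roughly equally to the weighted loss. A quick look at their magnitudes (an added check, not in the original):
# The rare class gets a weight several hundred times larger than the common one
cat(sprintf("Weight for class 0: %.2e\n", weight_for_0))
cat(sprintf("Weight for class 1: %.2e\n", weight_for_1))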
Normalize the data using training set statistics
feature_names <- colnames(train_df) %>% setdiff("Class")

means <- lapply(train_df[feature_names], mean)
stds <- lapply(train_df[feature_names], sd)

for (name in feature_names) {
  train_df[[name]] %<>% { (. - means[[name]]) / stds[[name]] }
  val_df[[name]]   %<>% { (. - means[[name]]) / stds[[name]] }
}
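As a sanity check (not in the original example), the training features should now have mean ~0 and standard deviation ~1; the validation features will be close but not exact, since they were scaled with training-set statistics:
# Training features are standardized exactly; validation only approximately
range(sapply(train_df[feature_names], mean))
range(sapply(train_df[feature_names], sd))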
Build a binary classification model
model <- keras_model_sequential(input_shape = c(length(feature_names))) %>%
  layer_dense(256, activation = "relu") %>%
  layer_dense(256, activation = "relu") %>%
  layer_dropout(0.3) %>%
  layer_dense(256, activation = "relu") %>%
  layer_dropout(0.3) %>%
  layer_dense(1, activation = "sigmoid")

model
Model: "sequential"
____________________________________________________________________________
 Layer (type)                     Output Shape                  Param #     
============================================================================
 dense_3 (Dense)                  (None, 256)                   7936        
 dense_2 (Dense)                  (None, 256)                   65792       
 dropout_1 (Dropout)              (None, 256)                   0           
 dense_1 (Dense)                  (None, 256)                   65792       
 dropout (Dropout)                (None, 256)                   0           
 dense (Dense)                    (None, 1)                     257         
============================================================================
Total params: 139,777
Trainable params: 139,777
Non-trainable params: 0
____________________________________________________________________________
Train the model with the class_weight argument
metrics <- list(
  metric_false_negatives(name = "fn"),
  metric_false_positives(name = "fp"),
  metric_true_negatives(name = "tn"),
  metric_true_positives(name = "tp"),
  metric_precision(name = "precision"),
  metric_recall(name = "recall")
)
model %>% compile(
  optimizer = optimizer_adam(1e-2),
  loss = "binary_crossentropy",
  metrics = metrics
)
class_weight <- list("0" = weight_for_0,
                     "1" = weight_for_1)
callbacks <- list(
  callback_model_checkpoint("fraud_model_at_epoch_{epoch}.h5"))
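The checkpoint callback writes one HDF5 file per epoch, so the model from any epoch can be restored later if a later epoch overfits. For example (a usage sketch; the epoch number here is hypothetical):
# Restore the weights saved at a chosen epoch (epoch 10 is a hypothetical example)
model_at_10 <- load_model_hdf5("fraud_model_at_epoch_10.h5")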

train_features <- as.matrix(train_df[feature_names])
train_targets <- as.matrix(train_df$Class)
validation_data <- list(
   as.matrix(val_df[feature_names]),
   as.matrix(val_df$Class))

model %>%
  fit(train_features, train_targets,
      validation_data = validation_data,
      class_weight = class_weight,
      batch_size = 2048, epochs = 30,
      callbacks = callbacks,
      verbose = 2)
val_pred <- model %>%
  predict(as.matrix(val_df[feature_names])) %>%
  { ifelse(. > .5, 1, 0) }

pred_correct <- val_df$Class == val_pred
cat(sprintf("Validation accuracy: %.2f", mean(pred_correct)))
Validation accuracy: 0.99
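Accuracy is a weak yardstick here: because fraud is so rare, a degenerate model that always predicts "legitimate" scores almost as high. A baseline comparison (added for context, not in the original):
# Trivial baseline: always predict the majority class (0)
cat(sprintf("Baseline accuracy (all zeros): %.4f", mean(val_df$Class == 0)))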
fraudulent <- val_df$Class == 1

n_fraudulent_detected <- sum(fraudulent & pred_correct)
n_fraudulent_missed <- sum(fraudulent & !pred_correct)
n_legitimate_flagged <- sum(!fraudulent & !pred_correct)
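From these counts we can compute the metrics that actually matter for fraud detection (a small added sketch, not in the original): recall on the fraud class, and the precision cost paid for it.
# Recall: fraction of actual fraud detected
# Precision: fraction of fraud flags that were correct
recall <- n_fraudulent_detected / sum(fraudulent)
precision <- n_fraudulent_detected / (n_fraudulent_detected + n_legitimate_flagged)
cat(sprintf("Recall: %.2f | Precision: %.2f\n", recall, precision))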
Conclusions

At the end of training, out of 56,962 validation transactions, we are:

- Correctly identifying n_fraudulent_detected of them as fraudulent
- Missing n_fraudulent_missed fraudulent transactions
- At the cost of incorrectly flagging n_legitimate_flagged legitimate transactions

In the real world, one would put an even higher weight on class 1, reflecting that false negatives (missed fraud) are more costly than false positives (legitimate transactions flagged).
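For instance (an illustrative adjustment, not from the original), the class-1 weight could simply be scaled up before compiling and fitting:
# Hypothetical: penalize missed fraud 5x more than the inverse-frequency baseline
class_weight <- list("0" = weight_for_0,
                     "1" = 5 * weight_for_1)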

Next time your credit card gets declined in an online purchase – this is why.

