Data

The explore package offers a simplified way to use popular data sets or to create synthetic data for experimenting/teaching/training.

Use data

Penguins

This data set comes with the palmerpenguins package. It contains measurements of penguins: species, island in the Palmer Archipelago, size (flipper length, body mass, bill dimensions), and sex.

library(dplyr)
library(explore)

data <- use_data_penguins()
glimpse(data)
#> Rows: 344
#> Columns: 8
#> $ species           <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel…
#> $ island            <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgerse…
#> $ bill_length_mm    <dbl> 39.1, 39.5, 40.3, NA, 36.7, 39.3, 38.9, 39.2, 34.1, …
#> $ bill_depth_mm     <dbl> 18.7, 17.4, 18.0, NA, 19.3, 20.6, 17.8, 19.6, 18.1, …
#> $ flipper_length_mm <int> 181, 186, 195, NA, 193, 190, 181, 195, 193, 190, 186…
#> $ body_mass_g       <int> 3750, 3800, 3250, NA, 3450, 3650, 3625, 4675, 3475, …
#> $ sex               <fct> male, female, female, NA, female, male, female, male…
#> $ year              <int> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007…

data <- use_data_penguins(short_names = TRUE)
glimpse(data)
#> Rows: 344
#> Columns: 8
#> $ species     <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Ad…
#> $ island      <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgersen, Tor…
#> $ bill_len    <dbl> 39.1, 39.5, 40.3, NA, 36.7, 39.3, 38.9, 39.2, 34.1, 42.0, …
#> $ bill_dep    <dbl> 18.7, 17.4, 18.0, NA, 19.3, 20.6, 17.8, 19.6, 18.1, 20.2, …
#> $ flipper_len <int> 181, 186, 195, NA, 193, 190, 181, 195, 193, 190, 186, 180,…
#> $ body_mass   <int> 3750, 3800, 3250, NA, 3450, 3650, 3625, 4675, 3475, 4250, …
#> $ sex         <fct> male, female, female, NA, female, male, female, male, NA, …
#> $ year        <int> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007…

Starwars

This data set comes with the dplyr package. It contains data on 87 Star Wars characters.

data <- use_data_starwars()
glimpse(data)
#> Rows: 87
#> Columns: 14
#> $ name       <chr> "Luke Skywalker", "C-3PO", "R2-D2", "Darth Vader", "Leia Or…
#> $ height     <int> 172, 167, 96, 202, 150, 178, 165, 97, 183, 182, 188, 180, 2…
#> $ mass       <dbl> 77.0, 75.0, 32.0, 136.0, 49.0, 120.0, 75.0, 32.0, 84.0, 77.…
#> $ hair_color <chr> "blond", NA, NA, "none", "brown", "brown, grey", "brown", N…
#> $ skin_color <chr> "fair", "gold", "white, blue", "white", "light", "light", "…
#> $ eye_color  <chr> "blue", "yellow", "red", "yellow", "brown", "blue", "blue",…
#> $ birth_year <dbl> 19.0, 112.0, 33.0, 41.9, 19.0, 52.0, 47.0, NA, 24.0, 57.0, …
#> $ sex        <chr> "male", "none", "none", "male", "female", "male", "female",…
#> $ gender     <chr> "masculine", "masculine", "masculine", "masculine", "femini…
#> $ homeworld  <chr> "Tatooine", "Tatooine", "Naboo", "Tatooine", "Alderaan", "T…
#> $ species    <chr> "Human", "Droid", "Droid", "Human", "Human", "Human", "Huma…
#> $ films      <list> <"A New Hope", "The Empire Strikes Back", "Return of the J…
#> $ vehicles   <list> <"Snowspeeder", "Imperial Speeder Bike">, <>, <>, <>, "Imp…
#> $ starships  <list> <"X-wing", "Imperial shuttle">, <>, <>, "TIE Advanced x1",…

Diamonds

This data set comes with the ggplot2 package. It contains the prices and other attributes of almost 54,000 diamonds.

data <- use_data_diamonds()
glimpse(data)
#> Rows: 53,940
#> Columns: 10
#> $ carat   <dbl> 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23, 0.…
#> $ cut     <ord> Ideal, Premium, Good, Premium, Good, Very Good, Very Good, Ver…
#> $ color   <ord> E, E, E, I, J, J, I, H, E, H, J, J, F, J, E, E, I, J, J, J, I,…
#> $ clarity <ord> SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, SI1, VS1, …
#> $ depth   <dbl> 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4, 64…
#> $ table   <dbl> 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54, 62, 58…
#> $ price   <int> 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339, 340, 34…
#> $ x       <dbl> 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, 4.00, 4.…
#> $ y       <dbl> 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05, 4.…
#> $ z       <dbl> 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, 2.39, 2.…

Iris

This data set comes with base R. The data set gives the measurements in centimeters of the variables sepal length and width and petal length and width, respectively, for 50 flowers from each of 3 species of iris. The species are Iris setosa, versicolor, and virginica.

data <- use_data_iris()
glimpse(data)
#> Rows: 150
#> Columns: 5
#> $ Sepal.Length <dbl> 5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.6, 5.0, 4.4, 4.9, 5.4, 4.…
#> $ Sepal.Width  <dbl> 3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7, 3.…
#> $ Petal.Length <dbl> 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5, 1.5, 1.…
#> $ Petal.Width  <dbl> 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1, 0.2, 0.…
#> $ Species      <fct> setosa, setosa, setosa, setosa, setosa, setosa, setosa, s…

mpg

This data set comes with the ggplot2 package. It contains a subset of the fuel economy data that the EPA makes available on https://fueleconomy.gov/. It contains only models which had a new release every year between 1999 and 2008 - this was used as a proxy for the popularity of the car.

data <- use_data_mpg()
glimpse(data)
#> Rows: 234
#> Columns: 11
#> $ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi", "audi", "…
#> $ model        <chr> "a4", "a4", "a4", "a4", "a4", "a4", "a4", "a4 quattro", "…
#> $ displ        <dbl> 1.8, 1.8, 2.0, 2.0, 2.8, 2.8, 3.1, 1.8, 1.8, 2.0, 2.0, 2.…
#> $ year         <int> 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, 200…
#> $ cyl          <int> 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 8, 8, …
#> $ trans        <chr> "auto(l5)", "manual(m5)", "manual(m6)", "auto(av)", "auto…
#> $ drv          <chr> "f", "f", "f", "f", "f", "f", "f", "4", "4", "4", "4", "4…
#> $ cty          <int> 18, 21, 20, 21, 16, 18, 18, 18, 16, 20, 19, 15, 17, 17, 1…
#> $ hwy          <int> 29, 29, 31, 30, 26, 26, 27, 26, 25, 28, 27, 25, 25, 25, 2…
#> $ fl           <chr> "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p…
#> $ class        <chr> "compact", "compact", "compact", "compact", "compact", "c…

mtcars

This data set comes with base R. The data was extracted from the 1974 Motor Trend US magazine, and comprises fuel consumption and 10 aspects of automobile design and performance for 32 automobiles (1973–74 models).

data <- use_data_mtcars()
glimpse(data)
#> Rows: 32
#> Columns: 11
#> $ mpg  <dbl> 21.0, 21.0, 22.8, 21.4, 18.7, 18.1, 14.3, 24.4, 22.8, 19.2, 17.8,…
#> $ cyl  <dbl> 6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 8,…
#> $ disp <dbl> 160.0, 160.0, 108.0, 258.0, 360.0, 225.0, 360.0, 146.7, 140.8, 16…
#> $ hp   <dbl> 110, 110, 93, 110, 175, 105, 245, 62, 95, 123, 123, 180, 180, 180…
#> $ drat <dbl> 3.90, 3.90, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92,…
#> $ wt   <dbl> 2.620, 2.875, 2.320, 3.215, 3.440, 3.460, 3.570, 3.190, 3.150, 3.…
#> $ qsec <dbl> 16.46, 17.02, 18.61, 19.44, 17.02, 20.22, 15.84, 20.00, 22.90, 18…
#> $ vs   <dbl> 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,…
#> $ am   <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,…
#> $ gear <dbl> 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3,…
#> $ carb <dbl> 4, 4, 1, 1, 2, 1, 4, 2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1, 1, 2,…

Titanic

This data set comes with base R. It contains information on the survival of passengers on the Titanic.

data <- use_data_titanic(count = FALSE)
glimpse(data)
#> Rows: 2,201
#> Columns: 4
#> $ Class    <chr> "3rd", "3rd", "3rd", "3rd", "3rd", "3rd", "3rd", "3rd", "3rd"…
#> $ Sex      <chr> "Male", "Male", "Male", "Male", "Male", "Male", "Male", "Male…
#> $ Age      <chr> "Child", "Child", "Child", "Child", "Child", "Child", "Child"…
#> $ Survived <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "…

data <- use_data_titanic(count = TRUE)
glimpse(data)
#> Rows: 32
#> Columns: 5
#> $ Class    <chr> "1st", "2nd", "3rd", "Crew", "1st", "2nd", "3rd", "Crew", "1s…
#> $ Sex      <chr> "Male", "Male", "Male", "Male", "Female", "Female", "Female",…
#> $ Age      <chr> "Child", "Child", "Child", "Child", "Child", "Child", "Child"…
#> $ Survived <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "…
#> $ n        <dbl> 0, 0, 35, 0, 0, 0, 17, 0, 118, 154, 387, 670, 4, 13, 89, 3, 5…

Beer

This data set is an incomplete collection of popular beers in Austria, Germany and Switzerland. The data were collected from various websites in 2023. Some of the collected data may be incorrect.

data <- use_data_beer()
glimpse(data)
#> Rows: 161
#> Columns: 11
#> $ name              <chr> "Puntigamer Maerzen", "Puntigamer PR0,0ST", "Puntiga…
#> $ brand             <chr> "Puntigamer", "Puntigamer", "Puntigamer", "Puntigame…
#> $ country           <chr> "Austria", "Austria", "Austria", "Austria", "Austria…
#> $ year              <dbl> 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023…
#> $ type              <chr> "Rest", "Alkoholfrei", "Rest", "Rest", "Rest", "Rest…
#> $ color_dark        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1…
#> $ alcohol_vol_pct   <dbl> 5.1, 0.0, 5.2, 6.0, 4.9, 5.2, 4.4, 0.5, 5.7, 5.3, 7.…
#> $ original_wort     <dbl> 11.5, 5.1, 12.1, 13.8, 11.5, 11.9, 11.1, 7.0, 13.2, …
#> $ energy_kcal_100ml <dbl> 40, 20, 43, 50, 42, 43, 42, 27, 48, 45, 58, 45, 43, …
#> $ carb_g_100ml      <dbl> 2.7, 4.4, 2.9, 3.6, 3.2, 3.0, 3.8, 5.7, 3.5, 3.3, 3.…
#> $ sugar_g_100ml     <dbl> 0.0, 1.2, 0.0, 0.0, 0.0, 0.0, 0.0, 2.7, 0.0, 0.0, 0.…

Create data

Artificial data that can be used for unit-testing or teaching.

A/B testing

data <- create_data_abtest()
glimpse(data)
#> Rows: 4
#> Columns: 3
#> $ group   <chr> "A", "A", "B", "B"
#> $ n       <dbl> 90, 10, 95, 5
#> $ success <dbl> 0, 1, 0, 1

App

data <- create_data_app(obs = 1000)
glimpse(data)
#> Rows: 1,000
#> Columns: 7
#> $ os           <chr> "Android", "iOS", "Android", "iOS", "Other", "Android", "…
#> $ free         <int> 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, …
#> $ downloads    <int> 5802, 5048, 4579, 3449, 2464, 11276, 4026, 6841, 10419, 5…
#> $ rating       <dbl> 4, 4, 3, 4, 1, 4, 5, 5, 4, 1, 1, 4, 4, 5, 5, 4, 3, 4, 2, …
#> $ type         <chr> "Kids", "Media", "Other", "Shopping", "Connect", "Learn",…
#> $ updates      <dbl> 63.00000, 58.00000, 62.00000, 44.00000, 24.00000, 75.0000…
#> $ screen_sizes <dbl> 3, 2, 3, 2, 1, 3, 1, 2, 2, 3, 1, 3, 2, 1, 3, 1, 4, 5, 3, …

Buy

data <- create_data_buy(obs = 1000)
glimpse(data)
#> Rows: 1,000
#> Columns: 13
#> $ period          <int> 202012, 202012, 202012, 202012, 202012, 202012, 202012…
#> $ buy             <int> 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, …
#> $ age             <int> 39, 57, 55, 66, 71, 44, 64, 51, 70, 44, 58, 47, 68, 71…
#> $ city_ind        <int> 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, …
#> $ female_ind      <int> 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, …
#> $ fixedvoice_ind  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
#> $ fixeddata_ind   <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
#> $ fixedtv_ind     <int> 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, …
#> $ mobilevoice_ind <int> 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, …
#> $ mobiledata_prd  <chr> "NO", "NO", "MOBILE STICK", "NO", "BUSINESS", "BUSINES…
#> $ bbi_speed_ind   <int> 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, …
#> $ bbi_usg_gb      <int> 77, 49, 53, 44, 55, 93, 50, 64, 63, 87, 45, 45, 70, 79…
#> $ hh_single       <int> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, …

Churn

data <- create_data_churn(obs = 1000)
glimpse(data)
#> Rows: 1,000
#> Columns: 9
#> $ price      <dbl> 29, 27, 29, 11, 18, 21, 19, 13, 29, 22, 13, 27, 17, 11, 16,…
#> $ type       <chr> "Premium", "Regular", "Premium", "Promo", "Promo", "Promo",…
#> $ usage      <dbl> 63.0, 39.0, 87.0, 29.0, 22.5, 8.0, 56.0, 94.5, 46.0, 76.0, …
#> $ shared     <int> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1,…
#> $ device     <chr> "Computer", "Tablet", "Phone", "Tablet", "Computer", "Table…
#> $ newsletter <int> 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,…
#> $ language   <chr> "sp", "sp", "sp", "sp", "en", "en", "fr", "en", "en", "de",…
#> $ duration   <int> 7, 47, 99, 33, 94, 17, 95, 92, 43, 16, 62, 14, 52, 20, 76, …
#> $ churn      <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,…

Esoteric

data <- create_data_esoteric(obs = 1000)
glimpse(data)
#> Rows: 1,000
#> Columns: 6
#> $ starsign        <chr> "Leo", "Aquarius", "Virgo", "Pisces", "Aries", "Taurus…
#> $ chinese         <chr> "Dragon", "Monkey", "Tiger", "Pig", "Pig", "Horse", "D…
#> $ moon            <chr> "Waxing (+)", "Waxing (+)", "Waxing (+)", "Waning (-)"…
#> $ blood           <chr> "A+", "AB+", "0+", "0+", "A+", "0+", "B+", "0+", "0-",…
#> $ fingers_crossed <int> 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, …
#> $ success         <int> 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, …

Newsletter

data <- create_data_newsletter(obs = 1000)
glimpse(data)
#> Rows: 1,000
#> Columns: 6
#> $ sending_h <int> 14, 14, 13, 13, 14, 16, 15, 12, 13, 14, 16, 14, 14, 12, 15, …
#> $ message   <chr> "voucher", "news", "news", "news", "voucher", "news", "news"…
#> $ age       <int> 80, 25, 17, 78, 30, 76, 30, 64, 50, 72, 65, 23, 23, 19, 30, …
#> $ send      <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
#> $ click     <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, …
#> $ buy       <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …

Person

data <- create_data_person(obs = 1000)
glimpse(data)
#> Rows: 1,000
#> Columns: 15
#> $ age               <int> 46, 94, 66, 29, 82, 57, 65, 58, 29, 40, 84, 72, 24, …
#> $ gender            <chr> "Female", "Female", "Male", "Male", "Female", "Femal…
#> $ eye_color         <chr> "Blue", "Green", "Brown", "Green", "Brown", "Brown",…
#> $ shoe_size         <dbl> 45.2, 37.0, 45.0, 45.0, 39.0, 38.2, 41.2, 46.0, 40.0…
#> $ iq                <dbl> 141, 71, 80, 74, 119, 95, 97, 135, 88, 140, 71, 126,…
#> $ education         <int> 66, 41, 49, 49, 25, 68, 87, 46, 78, 14, 65, 62, 68, …
#> $ income            <dbl> 132.0, 95.0, 18.0, 54.0, 70.0, 128.0, 128.5, 32.0, 8…
#> $ handset           <chr> "Apple", "Apple", "Apple", "Android", "Apple", "Andr…
#> $ pet               <chr> "No", "Cat", "Other", "No", "Dog", "No", "Cat", "Dog…
#> $ favorite_pizza    <chr> "Pepperoni", "Hawai", "Margaritha", "Carciofi", "Mar…
#> $ favorite_icecream <chr> "Lemon", "Strawberry", "Vanilla", "Vanilla", "Apple"…
#> $ likes_garlic      <int> 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0…
#> $ likes_sushi       <int> 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1…
#> $ likes_beatles     <int> 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0…
#> $ likes_beer        <int> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1…

Random

data <- create_data_random(obs = 1000)
glimpse(data)
#> Rows: 1,000
#> Columns: 12
#> $ id         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
#> $ target_ind <int> 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0,…
#> $ var_1      <int> 27, 59, 16, 85, 85, 48, 77, 30, 7, 44, 46, 34, 19, 51, 2, 7…
#> $ var_2      <int> 16, 14, 15, 51, 49, 62, 45, 6, 1, 22, 85, 27, 60, 61, 99, 1…
#> $ var_3      <int> 21, 94, 38, 63, 18, 66, 73, 50, 87, 83, 98, 67, 64, 5, 19, …
#> $ var_4      <int> 30, 83, 59, 81, 29, 14, 89, 1, 57, 97, 27, 98, 4, 26, 26, 9…
#> $ var_5      <int> 25, 99, 72, 65, 24, 9, 30, 54, 78, 27, 32, 95, 49, 97, 85, …
#> $ var_6      <int> 44, 40, 37, 53, 7, 72, 24, 84, 100, 11, 49, 68, 82, 77, 43,…
#> $ var_7      <int> 93, 59, 8, 85, 3, 81, 39, 14, 67, 62, 45, 81, 87, 99, 40, 3…
#> $ var_8      <int> 58, 49, 74, 23, 75, 82, 10, 28, 2, 60, 99, 85, 59, 34, 65, …
#> $ var_9      <int> 80, 88, 24, 56, 90, 1, 16, 26, 77, 7, 90, 31, 89, 61, 46, 7…
#> $ var_10     <int> 31, 32, 87, 33, 13, 36, 93, 88, 82, 2, 63, 78, 72, 19, 58, …

Unfair

data <- create_data_unfair(obs = 1000)
glimpse(data)
#> Rows: 1,000
#> Columns: 22
#> $ age         <int> 46, 94, 66, 29, 82, 57, 65, 58, 29, 40, 84, 72, 24, 87, 41…
#> $ gender      <chr> "Female", "Female", "Male", "Male", "Female", "Female", "F…
#> $ eye_color   <chr> "Blue", "Green", "Blue", "Blue", "Blue", "Brown", "Brown",…
#> $ shoe_size   <dbl> 45.2, 37.0, 45.0, 45.0, 39.0, 38.2, 41.2, 46.0, 40.0, 42.0…
#> $ iq          <dbl> 141, 71, 80, 74, 119, 95, 97, 135, 88, 140, 71, 126, 106, …
#> $ education   <int> 66, 41, 49, 49, 25, 68, 87, 46, 78, 14, 65, 62, 68, 16, 69…
#> $ income      <dbl> 132.0, 95.0, 18.0, 54.0, 70.0, 128.0, 128.5, 32.0, 82.0, 9…
#> $ handset     <chr> "Apple", "Apple", "Apple", "Android", "Apple", "Android", …
#> $ pet         <chr> "Other", "Cat", "Cat", "Dog", "Cat", "No", "Dog", "No", "N…
#> $ smoking     <int> 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0…
#> $ name_arabic <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0…
#> $ outfit      <chr> "Casual", "Casual", "Casual", "Alternative", "Elegant", "A…
#> $ glasses     <int> 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1…
#> $ tatoos      <int> 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0…
#> $ kids        <int> 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0…
#> $ bad_debt    <dbl> 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0…
#> $ credit_card <chr> "No", "Master", "Master", "No", "No", "Visa", "Visa", "Vis…
#> $ left_handed <int> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…
#> $ skin_color  <chr> "White", "Brown", "White", "White", "White", "White", "Bla…
#> $ religion    <chr> "Christian", "No", "Christian", "No", "Christian", "No", "…
#> $ internet_gb <dbl> 0.000000, 60.609298, 260.437887, 55.199729, 0.000000, 179.…
#> $ target_ind  <int> 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1…

Empty

Create an empty data set and add random variables.

data <- create_data_empty(obs = 1000) %>%
  add_var_random_01("smoking", prob = c(0.8, 0.2)) %>%
  add_var_random_cat("gender", 
                     cat = c("female", "male", "diverse"), 
                     prob = c(0.45, 0.45, 0.1)) %>%
  add_var_random_dbl("internet_usage", min_val = 0, max_val = 1000) %>%
  add_var_random_int("age", min_val = 18, max_val = 100) %>%
  add_var_random_moon() %>%
  add_var_random_starsign()
glimpse(data)
#> Rows: 1,000
#> Columns: 6
#> $ smoking         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
#> $ gender          <chr> "male", "male", "female", "male", "female", "female", …
#> $ internet_usage  <dbl> 923.7630, 979.0669, 773.8658, 697.6332, 470.4925, 609.…
#> $ age             <int> 84, 54, 44, 45, 60, 73, 60, 74, 62, 46, 81, 95, 58, 19…
#> $ random_moon     <chr> "Waxing (+)", "Waning (-)", "Waning (-)", "Waxing (+)"…
#> $ random_starsign <chr> "Saggitarius", "Saggitarius", "Libra", "Pisces", "Pisc…

Data

Roland Krasser

2025-12-10

Use data

Penguins

Starwars

Diamonds

Iris

mpg

mtcars

Titanic

Beer

Create data

A/B testing

App

Buy

Churn

Esoteric

Newsletter

Person

Random

Unfair

Empty

Data

Roland Krasser

2025-12-10

Use data

Penguins

Starwars

Diamonds

Iris

mpg

mtcars

Titanic

Beer

Create data

A/B testing

App

Buy

Churn

Esoteric

Newsletter

Person

Random

Unfair

Empty