## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(eval=FALSE,
  collapse = TRUE,
  comment = "#>"
)

## ----setup, include=FALSE-----------------------------------------------------
# library(rPandas)

## -----------------------------------------------------------------------------
# reticulate::py_install("pandas")

## -----------------------------------------------------------------------------
# # Run this if you have connection issues
# rp_check_env()

## -----------------------------------------------------------------------------
# reticulate::conda_list()

## -----------------------------------------------------------------------------
# # Replace with the path from conda_list()
# reticulate::use_python(python = reticulate::conda_list()$python[1], required = TRUE)
# 
# # Or, if you prefer to use a conda environment by name:
# reticulate::use_condaenv("your_environment_name", required = TRUE)

## -----------------------------------------------------------------------------
# reticulate::use_python("/usr/local/bin/python3", required = TRUE)

## ----eval=TRUE----------------------------------------------------------------
# Make sure ggplot2 is installed to access the data
data(diamonds, package = "ggplot2")
head(diamonds)


## -----------------------------------------------------------------------------
# # Simple condition
# v1 <- rp_filter(diamonds, carat > 1)
# print(head(v1))
# #>   carat       cut color clarity depth table price    x    y    z
# #> 1  1.17 Very Good     J      I1  60.2    61  2774 6.83 6.90 4.13
# #> 2  1.01   Premium     F      I1  61.8    60  2781 6.39 6.36 3.94
# #> 3  1.01      Fair     E      I1  64.5    58  2788 6.29 6.21 4.03
# #> 4  1.01   Premium     H     SI2  62.7    59  2788 6.31 6.22 3.93
# #> 5  1.05 Very Good     J     SI2  63.2    56  2789 6.49 6.45 4.09
# #> 6  1.05      Fair     J     SI2  65.8    59  2789 6.41 6.27 4.18
# 
# # AND: multiple conditions
# v2 <- rp_filter(diamonds, carat > 1 & cut == "Ideal")
# print(head(v2))
# #>   carat   cut color clarity depth table price    x    y    z
# #> 1  1.01 Ideal     I      I1  61.5    57  2844 6.45 6.46 3.97
# #> 2  1.02 Ideal     H     SI2  61.6    55  2856 6.49 6.43 3.98
# #> 3  1.02 Ideal     I      I1  61.7    56  2872 6.44 6.49 3.99
# #> 4  1.02 Ideal     J     SI2  60.3    54  2879 6.53 6.50 3.93
# #> 5  1.01 Ideal     I      I1  61.5    57  2896 6.46 6.45 3.97
# #> 6  1.02 Ideal     I      I1  61.7    56  2925 6.49 6.44 3.99
# 
# # OR: use | (pipe)
# v3 <- rp_filter(diamonds, color == "D" | color == "E")
# print(head(v3))
# #>   carat     cut color clarity depth table price    x    y    z
# #> 1  0.23   Ideal     E     SI2  61.5    55   326 3.95 3.98 2.43
# #> 2  0.21 Premium     E     SI1  59.8    61   326 3.89 3.84 2.31
# #> 3  0.23    Good     E     VS1  56.9    65   327 4.05 4.07 2.31
# #> 4  0.22    Fair     E     VS2  65.1    61   337 3.87 3.78 2.49
# #> 5  0.20 Premium     E     SI2  60.2    62   345 3.79 3.75 2.27
# #> 6  0.32 Premium     E      I1  60.9    58   345 4.38 4.42 2.68
# 
# 
# # NOT: use !
# v4 <- rp_filter(diamonds, !(price > 10000))
# print(head(v4))
# #>   carat       cut color clarity depth table price    x    y    z
# #> 1  0.23     Ideal     E     SI2  61.5    55   326 3.95 3.98 2.43
# #> 2  0.21   Premium     E     SI1  59.8    61   326 3.89 3.84 2.31
# #> 3  0.23      Good     E     VS1  56.9    65   327 4.05 4.07 2.31
# #> 4  0.29   Premium     I     VS2  62.4    58   334 4.20 4.23 2.63
# #> 5  0.31      Good     J     SI2  63.3    58   335 4.34 4.35 2.75
# #> 6  0.24 Very Good     J    VVS2  62.8    57   336 3.94 3.96 2.48
# 
# 
# # %in% operator
# v5 <- rp_filter(diamonds, color %in% c("D", "E", "F"))
# print(head(v5))
# #>   carat     cut color clarity depth table price    x    y    z
# #> 1  0.23   Ideal     E     SI2  61.5    55   326 3.95 3.98 2.43
# #> 2  0.21 Premium     E     SI1  59.8    61   326 3.89 3.84 2.31
# #> 3  0.23    Good     E     VS1  56.9    65   327 4.05 4.07 2.31
# #> 4  0.22    Fair     E     VS2  65.1    61   337 3.87 3.78 2.49
# #> 5  0.22 Premium     F     SI1  60.4    61   342 3.88 3.84 2.33
# #> 6  0.20 Premium     E     SI2  60.2    62   345 3.79 3.75 2.27
# 
# # %notin% (if implemented)
# v6 <- rp_filter(diamonds, color %notin% c("D", "E", "F"))
# print(head(v6))
# #>   carat       cut color clarity depth table price    x    y    z
# #> 1  0.29   Premium     I     VS2  62.4    58   334 4.20 4.23 2.63
# #> 2  0.31      Good     J     SI2  63.3    58   335 4.34 4.35 2.75
# #> 3  0.24 Very Good     J    VVS2  62.8    57   336 3.94 3.96 2.48
# #> 4  0.24 Very Good     I    VVS1  62.3    57   336 3.95 3.98 2.47
# #> 5  0.26 Very Good     H     SI1  61.9    55   337 4.07 4.11 2.53
# #> 6  0.23 Very Good     H     VS1  59.4    61   338 4.00 4.05 2.39

## -----------------------------------------------------------------------------
# # Select three columns
# v4 <- rp_select(diamonds, carat, cut, price)
# print(head(v4))
# #>   carat       cut price
# #> 1  0.23     Ideal   326
# #> 2  0.21   Premium   326
# #> 3  0.23      Good   327
# #> 4  0.29   Premium   334
# #> 5  0.31      Good   335
# #> 6  0.24 Very Good   336

## -----------------------------------------------------------------------------
# # Sort by price (ascending by default)
# v8 <- rp_sort(diamonds, price)
# print(head(v8))
# #>   carat       cut color clarity depth table price    x    y    z
# #> 1  0.23     Ideal     E     SI2  61.5    55   326 3.95 3.98 2.43
# #> 2  0.21   Premium     E     SI1  59.8    61   326 3.89 3.84 2.31
# #> 3  0.23      Good     E     VS1  56.9    65   327 4.05 4.07 2.31
# #> 4  0.29   Premium     I     VS2  62.4    58   334 4.20 4.23 2.63
# #> 5  0.31      Good     J     SI2  63.3    58   335 4.34 4.35 2.75
# #> 6  0.24 Very Good     I    VVS1  62.3    57   336 3.95 3.98 2.47
# 
# # Use desc() to sort in descending order
# v9 <- rp_sort(diamonds, desc(price))
# print(head(v9))
# #>   carat       cut color clarity depth table price    x    y    z
# #> 1  2.29   Premium     I     VS2  60.8    60 18823 8.50 8.47 5.16
# #> 2  2.00 Very Good     G     SI1  63.5    56 18818 7.90 7.97 5.04
# #> 3  1.51     Ideal     G      IF  61.7    55 18806 7.37 7.41 4.56
# #> 4  2.07     Ideal     G     SI2  62.5    55 18804 8.20 8.13 5.11
# #> 5  2.00 Very Good     H     SI1  62.8    57 18803 7.95 8.00 5.01
# #> 6  2.29   Premium     I     SI1  61.8    59 18797 8.52 8.45 5.24
# 
# # Sort by multiple columns
# v10 <- rp_sort(diamonds, cut, desc(price))
# print(head(v10))
# #>   carat  cut color clarity depth table price     x     y    z
# #> 1  2.01 Fair     G     SI1  70.6    64 18574  7.43  6.64 4.69
# #> 2  2.02 Fair     H     VS2  64.5    57 18565  8.00  7.95 5.14
# #> 3  4.50 Fair     J      I1  65.8    58 18531 10.23 10.16 6.72
# #> 4  2.00 Fair     G     VS2  67.6    58 18515  7.65  7.61 5.16
# #> 5  2.51 Fair     H     SI2  64.7    57 18308  8.44  8.50 5.48
# #> 6  3.01 Fair     I     SI2  65.8    56 18242  8.99  8.94 5.90

## -----------------------------------------------------------------------------
# 
# # Create a new column
# v11 <- rp_mutate(diamonds, price_per_carat = price / carat)
# print(head(v11))
# #>   carat       cut color clarity depth table price    x    y    z
# #> 1  0.23     Ideal     E     SI2  61.5    55   326 3.95 3.98 2.43
# #> 2  0.21   Premium     E     SI1  59.8    61   326 3.89 3.84 2.31
# #> 3  0.23      Good     E     VS1  56.9    65   327 4.05 4.07 2.31
# #> 4  0.29   Premium     I     VS2  62.4    58   334 4.20 4.23 2.63
# #> 5  0.31      Good     J     SI2  63.3    58   335 4.34 4.35 2.75
# #> 6  0.24 Very Good     J    VVS2  62.8    57   336 3.94 3.96 2.48
# #>   price_per_carat
# #> 1        1417.391
# #> 2        1552.381
# #> 3        1421.739
# #> 4        1151.724
# #> 5        1080.645
# #> 6        1400.000
# 
# # Create multiple columns
# v12 <- rp_mutate(
#   diamonds,
#   price_per_carat = price / carat,
#   depth_pct = depth / 100
# )
# print(head(v12))
# #>   carat       cut color clarity depth table price    x    y    z
# #> 1  0.23     Ideal     E     SI2  61.5    55   326 3.95 3.98 2.43
# #> 2  0.21   Premium     E     SI1  59.8    61   326 3.89 3.84 2.31
# #> 3  0.23      Good     E     VS1  56.9    65   327 4.05 4.07 2.31
# #> 4  0.29   Premium     I     VS2  62.4    58   334 4.20 4.23 2.63
# #> 5  0.31      Good     J     SI2  63.3    58   335 4.34 4.35 2.75
# #> 6  0.24 Very Good     J    VVS2  62.8    57   336 3.94 3.96 2.48
# #>   price_per_carat depth_pct
# #> 1        1417.391     0.615
# #> 2        1552.381     0.598
# #> 3        1421.739     0.569
# #> 4        1151.724     0.624
# #> 5        1080.645     0.633
# #> 6        1400.000     0.628
# 
# # Remove one or more columns
# v13 <- rp_mutate(diamonds, to_remove = c("table", "depth"))
# print(head(v13))
# #>   carat       cut color clarity price    x    y    z
# #> 1  0.23     Ideal     E     SI2   326 3.95 3.98 2.43
# #> 2  0.21   Premium     E     SI1   326 3.89 3.84 2.31
# #> 3  0.23      Good     E     VS1   327 4.05 4.07 2.31
# #> 4  0.29   Premium     I     VS2   334 4.20 4.23 2.63
# #> 5  0.31      Good     J     SI2   335 4.34 4.35 2.75
# #> 6  0.24 Very Good     J    VVS2   336 3.94 3.96 2.48

## -----------------------------------------------------------------------------
# 
# # Summarize the entire data frame
# v14 <- rp_summarize(diamonds, avg_price = mean(price), max_carat = max(carat))
# print(v14)
# #>    price carat
# #> 1 3932.8   NaN
# #> 2    NaN  5.01
# 
# 
# # Group by one column (unquoted)
# v15 <- rp_summarize(diamonds, avg_price = mean(price), .by = cut)
# print(v15)
# #>         cut avg_price
# #> 1      Fair  4358.758
# #> 2      Good  3928.864
# #> 3 Very Good  3981.760
# #> 4   Premium  4584.258
# #> 5     Ideal  3457.542
# 
# # Group by multiple columns (unquoted)
# v16 <- rp_summarize(
#   diamonds,
#   avg_price = mean(price),
#   count = n(),
#   .by = c(cut, color)
# )
# print(head(v16))
# #>    cut color avg_price count
# #> 1 Fair     D  4291.061   163
# #> 2 Fair     E  3682.312   224
# #> 3 Fair     F  3827.003   312
# #> 4 Fair     G  4239.255   314
# #> 5 Fair     H  5135.683   303
# #> 6 Fair     I  4685.446   175
# 
# # Grouping also accepts quoted column names
# v17 <- rp_summarize(
#   diamonds,
#   avg_price = mean(price),
#   .by = c("cut", "color")
# )
# print(head(v17))
# #>    cut color avg_price
# #> 1 Fair     D  4291.061
# #> 2 Fair     E  3682.312
# #> 3 Fair     F  3827.003
# #> 4 Fair     G  4239.255
# #> 5 Fair     H  5135.683
# #> 6 Fair     I  4685.446

## -----------------------------------------------------------------------------
# 
# # Apply two functions to two columns, grouped by 'cut'
# v13 <- rp_calculate(
#   diamonds,
#   price, carat,
#   the.functions = c("mean", "sd"),
#   .by = cut
# )
# print(head(v13))
# #>         cut price.mean price.std carat.mean carat.std
# #> 1      Fair   4358.758  3560.387  1.0461366 0.5164043
# #> 2      Good   3928.864  3681.590  0.8491847 0.4540544
# #> 3 Very Good   3981.760  3935.862  0.8063814 0.4594354
# #> 4   Premium   4584.258  4349.205  0.8919549 0.5152616
# #> 5     Ideal   3457.542  3808.401  0.7028370 0.4328763

## -----------------------------------------------------------------------------
# # First 3 rows overall
# v19 <- rp_first_k_rows(diamonds, k = 3)
# print(v19)
# #>   carat     cut color clarity depth table price    x    y    z
# #> 1  0.23   Ideal     E     SI2  61.5    55   326 3.95 3.98 2.43
# #> 2  0.21 Premium     E     SI1  59.8    61   326 3.89 3.84 2.31
# #> 3  0.23    Good     E     VS1  56.9    65   327 4.05 4.07 2.31
# 
# # Last 2 rows per group (cut and clarity)
# v20 <- rp_last_k_rows(diamonds, k = 2, .by = c(cut, clarity))
# print(head(v20))
# #>   carat     cut color clarity depth table price    x    y    z
# #> 1  0.70    Fair     J    VVS1  67.6    54  1691 5.56 5.41 3.71
# #> 2  0.50    Fair     D    VVS1  65.9    64  1792 4.92 5.03 3.28
# #> 3  0.52    Fair     F      IF  64.6    58  2144 5.04 5.17 3.30
# #> 4  0.47    Fair     D      IF  60.6    60  2211 5.09 4.98 3.05
# #> 5  0.55    Good     F      IF  60.8    60  2266 5.26 5.36 3.23
# #> 6  0.54 Premium     F      IF  61.9    60  2391 5.26 5.21 3.24
# 
# # Both quoted and unquoted group specifications work
# v21 <- rp_first_k_rows(diamonds, k = 1, .by = c("cut", "color"))
# print(v21)
# #>    carat       cut color clarity depth table price    x    y    z
# #> 1   0.23     Ideal     E     SI2  61.5  55.0   326 3.95 3.98 2.43
# #> 2   0.21   Premium     E     SI1  59.8  61.0   326 3.89 3.84 2.31
# #> 3   0.23      Good     E     VS1  56.9  65.0   327 4.05 4.07 2.31
# #> 4   0.29   Premium     I     VS2  62.4  58.0   334 4.20 4.23 2.63
# #> 5   0.31      Good     J     SI2  63.3  58.0   335 4.34 4.35 2.75
# #> 6   0.24 Very Good     J    VVS2  62.8  57.0   336 3.94 3.96 2.48
# #> 7   0.24 Very Good     I    VVS1  62.3  57.0   336 3.95 3.98 2.47
# #> 8   0.26 Very Good     H     SI1  61.9  55.0   337 4.07 4.11 2.53
# #> 9   0.22      Fair     E     VS2  65.1  61.0   337 3.87 3.78 2.49
# #> 10  0.23     Ideal     J     VS1  62.8  56.0   340 3.93 3.90 2.46
# #> 11  0.22   Premium     F     SI1  60.4  61.0   342 3.88 3.84 2.33
# #> 12  0.30     Ideal     I     SI2  62.0  54.0   348 4.31 4.34 2.68
# #> 13  0.30      Good     I     SI2  63.3  56.0   351 4.26 4.30 2.71
# #> 14  0.23 Very Good     E     VS2  63.8  55.0   352 3.85 3.92 2.48
# #> 15  0.23 Very Good     G    VVS2  60.4  58.0   354 3.97 4.01 2.41
# #> 16  0.23 Very Good     D     VS2  60.5  61.0   357 3.96 3.97 2.40
# #> 17  0.23 Very Good     F     VS1  60.9  57.0   357 3.96 3.99 2.42
# #> 18  0.23      Good     F     VS1  58.2  59.0   402 4.06 4.08 2.37
# #> 19  0.31      Good     H     SI1  64.0  54.0   402 4.29 4.31 2.75
# #> 20  0.26      Good     D     VS2  65.2  56.0   403 3.99 4.02 2.61
# #> 21  0.23     Ideal     G     VS1  61.9  54.0   404 3.93 3.95 2.44
# #> 22  0.22   Premium     D     VS2  59.3  62.0   404 3.91 3.88 2.31
# #> 23  0.30   Premium     J     SI2  59.3  61.0   405 4.43 4.38 2.61
# #> 24  0.30     Ideal     D     SI1  62.5  57.0   552 4.29 4.32 2.69
# #> 25  0.31   Premium     G     SI1  61.8  58.0   553 4.35 4.32 2.68
# #> 26  0.30   Premium     H     SI1  62.9  59.0   554 4.28 4.24 2.68
# #> 27  0.96      Fair     F     SI2  66.3  62.0  2759 6.27 5.95 4.07
# #> 28  0.81     Ideal     F     SI2  58.8  57.0  2761 6.14 6.11 3.60
# #> 29  0.91      Fair     H     SI2  64.4  57.0  2763 6.11 6.09 3.93
# #> 30  0.77     Ideal     H     VS2  62.0  56.0  2763 5.89 5.86 3.64
# #> 31  0.72      Good     G     VS2  59.7  60.5  2776 5.80 5.84 3.47
# #> 32  0.84      Fair     G     SI1  55.1  67.0  2782 6.39 6.20 3.47
# #> 33  1.05      Fair     J     SI2  65.8  59.0  2789 6.41 6.27 4.18
# #> 34  0.90      Fair     I     SI1  67.3  59.0  2804 5.93 5.84 3.96
# #> 35  0.75      Fair     D     SI2  64.6  57.0  2848 5.74 5.72 3.70
# 

## -----------------------------------------------------------------------------
# # Total row count
# v22 <- rp_count(diamonds)
# print(v22)
# #>       n
# #> 1 53940
# 
# # Count per group
# v23 <- rp_count(diamonds, .by = cut)
# print(v23)
# #>         cut     n
# #> 1      Fair  1610
# #> 2      Good  4906
# #> 3 Very Good 12082
# #> 4   Premium 13791
# #> 5     Ideal 21551
# 
# # Count per combination of multiple groups
# v24 <- rp_count(diamonds, .by = c(cut, color))
# print(head(v24))
# #>    cut color   n
# #> 1 Fair     D 163
# #> 2 Fair     E 224
# #> 3 Fair     F 312
# #> 4 Fair     G 314
# #> 5 Fair     H 303
# #> 6 Fair     I 175

## -----------------------------------------------------------------------------
# # Load the pipe
# v25 <- diamonds |>
#   rp_filter(carat > 1 & color == "D") |>
#   rp_mutate(price_per_carat = price / carat) |>
#   rp_summarize(avg_ppc = mean(price_per_carat), .by = cut) |>
#   rp_sort(desc(avg_ppc))
# 
# print(head(v25))
# #>         cut  avg_ppc
# #> 1     Ideal 7546.163
# #> 2 Very Good 6789.316
# #> 3   Premium 6548.397
# #> 4      Good 5784.918
# #> 5      Fair 5414.87

## -----------------------------------------------------------------------------
# # See the code for a simple filter
# rp_filter(diamonds, carat > 1 & price < 400, return.as = "code")
# #> [1] "df.query('(carat > 1) and (price < 400)')"
# 
# # See the code for a mutate
# rp_mutate(diamonds, ppc = price / carat, return.as = "code")
# #> [1] "df.assign(ppc = lambda x: (x['price'] / x['carat']))"
# 
# # See the code for a complex summary
# rp_summarize(
#   diamonds,
#   avg_price = mean(price),
#   count = n(),
#   .by = c(cut, color),
#   return.as = "code"
# )
# #> [1] "df.groupby(['cut', 'color'], as_index=False, observed=True).agg(avg_price = ('price', 'mean'), count = ('price', 'size'))"

## -----------------------------------------------------------------------------
# # Default placeholder
# rp_filter(diamonds, carat > 1, return.as = "code")
# #> [1] "df.query('carat > 1')"
# 
# # With custom table name
# rp_filter(diamonds, carat > 1, table_name = "diamonds", return.as = "code")
# #> [1] "diamonds.query('carat > 1')"
# 

