-1

I have the following dataset (Break_data) collected from the school calendar starting and ending of the breaks:

 print(Break_data)

 Start        End          Break       Year
1 2016-02-24 2016-02-29   Spring_Break 2016
2 2016-03-23 2016-03-28  Easter_Recess 2016
3 2016-10-05 2016-10-10 Mid_Term_Break 2016
4 2017-03-01 2017-03-06   Spring_Break 2017
5 2017-04-12 2017-04-17  Easter_Recess 2017
6 2017-10-04 2017-10-09 Mid_Term_Break 2017
7 2018-02-28 2018-03-05   Spring_Break 2018
8 2018-03-28 2018-04-02  Easter_Recess 2018

And this is a very large dataset

head(df$date)
[1] "2016-02-05" "2016-02-05" "2016-02-05" "2016-02-05" "2016-02-05" "2016-02-05"

tail(df$date)
[1] "2018-07-12" "2018-07-12" "2018-07-12" "2018-07-12" "2018-07-12" "2018-07-12"

Following the steps provided in:https://stackoverflow.com/a/51052626/9341589

I want to create a similar factor variable Break by comparing with a range of dataset df (i.e includes many variables in addition to date from 2016-02-05 to 2018-07-12)- sampling interval is 15-minutes (i.e one day is 96 rows).

In my case, in addition to these values shown in the table, I want to have the values that do not belong to Start and End of these dates to be considered Non_Break days.

Following the steps in the link mentioned above this is the modified version of the code in R:

Break_data$Start <- ymd(Break_data$Start)
Break_data$End <- ymd(Break_data$End)
df$date <- ymd(df$date)

LU <-  Map(`:`, Break_data$Start, Break_data$End)
LU <- data.frame(value = unlist(LU),
                 index = rep(seq_along(LU), lapply(LU, length)))

df$Break <- Break_data$Break[LU$index[match(df$date, LU$value)]]

I presume in addition to this I have to provide Non_Break in a for loop or simple if function to the period of times that are not within start and end ranges.

Edit: I attempted in two different ways:

FIRST- without using the mapping

for (i in c(1:nrow(df))){
  if (df$date[i] >= "2016-02-24" & df$date <= "2016-02-29")
    df$Break[i]<-"Spring_Break"
  else if (df$date[i] >= "2016-03-23" & df$date <= "2016-03-28")
    df$Break[i]<-"Easter_Recess"
  else if (df$date[i] >= "2016-10-05" & df$date <= "2016-10-10")
    df$Break[i]<-"Mid_Term_Break"
  else if (df$date[i] >= "2017-03-01" & df$date <= "2017-03-06")
    df$Break[i]<-"Spring_Break"
  else if (df$date[i] >= "2017-04-12" & df$date <= "2017-04-17")
    df$Break[i]<-"Easter_Recess"
  else if (df$date[i] >= "2017-10-04" & df$date <= "2017-10-09")
    df$Break[i]<-"Mid_Term_Break"
  else if (df$date[i] >= "2018-02-28" & df$date <= "2018-03-05")
    df$Break[i]<-"Easter_Recess"
  else if (df$date[i] >= "2018-03-28" & df$date <= "2018-04-02")
    df$Break[i]<-"Easter_Recess"
  else (df$Break[i]<-"Not_Break")
}

The first one is running forever :) and I am getting 2 values Not_Break and Spring_Break.

And this is the Warning message:

Warning messages:
1: In if (df$date[i] >= "2016-02-24" & df$date <= "2016-02-29") df$Break[i] <- "Spring_Break" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
2: In if (df$date[i] >= "2016-03-23" & df$date <= "2016-03-28") df$Break[i] <- "Easter_Recess" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
3: In if (df$date[i] >= "2016-10-05" & df$date <= "2016-10-10") df$Break[i] <- "Mid_Term_Break" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
4: In if (df$date[i] >= "2017-03-01" & df$date <= "2017-03-06") df$Break[i] <- "Spring_Break" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
5: In if (df$date[i] >= "2017-04-12" & df$date <= "2017-04-17") df$Break[i] <- "Easter_Recess" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
6: In if (df$date[i] >= "2017-10-04" & df$date <= "2017-10-09") df$Break[i] <- "Mid_Term_Break" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
7: In if (df$date[i] >= "2018-02-28" & df$date <= "2018-03-05") df$Break[i] <- "Easter_Recess" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
8: In if (df$date[i] >= "2018-03-28" & df$date <= "2018-04-02") df$Break[i] <- "Easter_Recess" else (df$Break[i] <- "Not_Break") :
  the condition has length > 1 and only the first element will be used
9: In if (df$date[i] >= "2016-02-24" & df$date <= "2016-02-29") df$Break[i] <- "Spring_Break" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
10: In if (df$date[i] >= "2016-03-23" & df$date <= "2016-03-28") df$Break[i] <- "Easter_Recess" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
11: In if (df$date[i] >= "2016-10-05" & df$date <= "2016-10-10") df$Break[i] <- "Mid_Term_Break" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
12: In if (df$date[i] >= "2017-03-01" & df$date <= "2017-03-06") df$Break[i] <- "Spring_Break" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
13: In if (df$date[i] >= "2017-04-12" & df$date <= "2017-04-17") df$Break[i] <- "Easter_Recess" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
14: In if (df$date[i] >= "2017-10-04" & df$date <= "2017-10-09") df$Break[i] <- "Mid_Term_Break" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
15: In if (df$date[i] >= "2018-02-28" & df$date <= "2018-03-05") df$Break[i] <- "Easter_Recess" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
16: In if (df$date[i] >= "2018-03-28" & df$date <= "2018-04-02") df$Break[i] <- "Easter_Recess" else (df$Break[i] <- "Not_Break") :
  the condition has length > 1 and only the first element will be used
17: In if (df$date[i] >= "2016-02-24" & df$date <= "2016-02-29") df$Break[i] <- "Spring_Break" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
18: In if (df$date[i] >= "2016-03-23" & df$date <= "2016-03-28") df$Break[i] <- "Easter_Recess" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
19: In if (df$date[i] >= "2016-10-05" & df$date <= "2016-10-10") df$Break[i] <- "Mid_Term_Break" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
20: In if (df$date[i] >= "2017-03-01" & df$date <= "2017-03-06") df$Break[i] <- "Spring_Break" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
21: In if (df$date[i] >= "2017-04-12" & df$date <= "2017-04-17") df$Break[i] <- "Easter_Recess" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
22: In if (df$date[i] >= "2017-10-04" & df$date <= "2017-10-09") df$Break[i] <- "Mid_Term_Break" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
23: In if (df$date[i] >= "2018-02-28" & df$date <= "2018-03-05") df$Break[i] <- "Easter_Recess" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
24: In if (df$date[i] >= "2018-03-28" & df$date <= "2018-04-02") df$Break[i] <- "Easter_Recess" else (df$Break[i] <- "Not_Break") :
  the condition has length > 1 and only the first element will be used
25: In if (df$date[i] >= "2016-02-24" & df$date <= "2016-02-29") df$Break[i] <- "Spring_Break" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
26: In if (df$date[i] >= "2016-03-23" & df$date <= "2016-03-28") df$Break[i] <- "Easter_Recess" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
27: In if (df$date[i] >= "2016-10-05" & df$date <= "2016-10-10") df$Break[i] <- "Mid_Term_Break" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
28: In if (df$date[i] >= "2017-03-01" & df$date <= "2017-03-06") df$Break[i] <- "Spring_Break" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
29: In if (df$date[i] >= "2017-04-12" & df$date <= "2017-04-17") df$Break[i] <- "Easter_Recess" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
30: In if (df$date[i] >= "2017-10-04" & df$date <= "2017-10-09") df$Break[i] <- "Mid_Term_Break" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
31: In if (df$date[i] >= "2018-02-28" & df$date <= "2018-03-05") df$Break[i] <- "Easter_Recess" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
32: In if (df$date[i] >= "2018-03-28" & df$date <= "2018-04-02") df$Break[i] <- "Easter_Recess" else (df$Break[i] <- "Not_Break") :
  the condition has length > 1 and only the first element will be used
33: In if (df$date[i] >= "2016-02-24" & df$date <= "2016-02-29") df$Break[i] <- "Spring_Break" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
34: In if (df$date[i] >= "2016-03-23" & df$date <= "2016-03-28") df$Break[i] <- "Easter_Recess" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
35: In if (df$date[i] >= "2016-10-05" & df$date <= "2016-10-10") df$Break[i] <- "Mid_Term_Break" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
36: In if (df$date[i] >= "2017-03-01" & df$date <= "2017-03-06") df$Break[i] <- "Spring_Break" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
37: In if (df$date[i] >= "2017-04-12" & df$date <= "2017-04-17") df$Break[i] <- "Easter_Recess" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
38: In if (df$date[i] >= "2017-10-04" & df$date <= "2017-10-09") df$Break[i] <- "Mid_Term_Break" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
39: In if (df$date[i] >= "2018-02-28" & df$date <= "2018-03-05") df$Break[i] <- "Easter_Recess" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
40: In if (df$date[i] >= "2018-03-28" & df$date <= "2018-04-02") df$Break[i] <- "Easter_Recess" else (df$Break[i] <- "Not_Break") :
  the condition has length > 1 and only the first element will be used
41: In if (df$date[i] >= "2016-02-24" & df$date <= "2016-02-29") df$Break[i] <- "Spring_Break" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
42: In if (df$date[i] >= "2016-03-23" & df$date <= "2016-03-28") df$Break[i] <- "Easter_Recess" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
43: In if (df$date[i] >= "2016-10-05" & df$date <= "2016-10-10") df$Break[i] <- "Mid_Term_Break" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
44: In if (df$date[i] >= "2017-03-01" & df$date <= "2017-03-06") df$Break[i] <- "Spring_Break" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
45: In if (df$date[i] >= "2017-04-12" & df$date <= "2017-04-17") df$Break[i] <- "Easter_Recess" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
46: In if (df$date[i] >= "2017-10-04" & df$date <= "2017-10-09") df$Break[i] <- "Mid_Term_Break" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
47: In if (df$date[i] >= "2018-02-28" & df$date <= "2018-03-05") df$Break[i] <- "Easter_Recess" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
48: In if (df$date[i] >= "2018-03-28" & df$date <= "2018-04-02") df$Break[i] <- "Easter_Recess" else (df$Break[i] <- "Not_Break") :
  the condition has length > 1 and only the first element will be used
49: In if (df$date[i] >= "2016-02-24" & df$date <= "2016-02-29") df$Break[i] <- "Spring_Break" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used
50: In if (df$date[i] >= "2016-03-23" & df$date <= "2016-03-28") df$Break[i] <- "Easter_Recess" else if (df$date[i] >=  ... :
  the condition has length > 1 and only the first element will be used

SECOND - adding to the code in the link:

LU <-  Map(`:`, Break_data$Start, Break_data$End)
LU <- data.frame(value = unlist(LU),
                 index = rep(seq_along(LU), lapply(LU, length)))

for (i in c(1:nrow(df))){
  if (df$Break <- Break_data$Break[LU$index[match(df$date, LU$value)]])
  else (df$date[i] >= "2016-02-05" & df$date <= "2018-07-12")
  df$Break[i]<-"Not_Break"
}

in the second one also I am getting an error. Any modification to the code or Implementation (in R or Python) will be appreciated

is there any more efficient way to do this?

Note: the datasets are publically available at: https://github.com/tomiscat/data

Has QUIT--Anony-Mousse
  • 73,503
  • 12
  • 131
  • 189
King Julien
  • 159
  • 17
  • One other idea is to create a data.table with the "break period" dates as V1 and the factor name as V2 and then join Break_data and df as data.table (and fill the NAs with any other factor value you want). I tried working on that but it's too late now and I'm buffled on why `mapply(seq.Date, as.Date(c('2016-02-24', '2016-02-24')), c(as.Date('2016-02-26'), as.Date('2016-03-24')), MoreArgs = list(by = 1))` works, but not using the Dates directly from the `data.table(df)`. If you 'd like me to work on that let me know, but I doubt I'll have time the following days. – Konstantinos Aug 20 '18 at 01:51
  • Using `%within%` and `interval()` from `lubridate` package, could also be another idea, but for huge (>GB) data sets it'd take more time. – Konstantinos Aug 20 '18 at 01:54
  • -- with respect to your comment - https://stackoverflow.com/questions/22125224/add-column-to-data-frame-testing-categorical-variable-in-other-column/22127532?noredirect=1#comment90778197_22127532 – Konstantinos Aug 20 '18 at 02:00

1 Answers1

1
library(lubridate)

# data
Break_data <- data.table::fread(
" Start        End          Break       Year
 2016-02-24 2016-02-29   Spring_Break 2016
 2016-03-23 2016-03-28  Easter_Recess 2016
 2016-10-05 2016-10-10 Mid_Term_Break 2016
 2017-03-01 2017-03-06   Spring_Break 2017
 2017-04-12 2017-04-17  Easter_Recess 2017
 2017-10-04 2017-10-09 Mid_Term_Break 2017
 2018-02-28 2018-03-05   Spring_Break 2018
 2018-03-28 2018-04-02  Easter_Recess 2018"
)
df <- data.frame(
  date = c("2016-02-05","2016-02-05", "2016-02-05" ,"2016-02-05", "2016-02-05", "2016-02-05",
           "2016-02-26", "2016-10-07", "2018-03-30",
            "2018-07-12","2018-07-12", "2018-07-12", "2018-07-12", "2018-07-12" ,"2018-07-12")
)

# mapping

Break_data$Start <- ymd(Break_data$Start)
Break_data$End <- ymd(Break_data$End)
df$date <- ymd(df$date)
LU <-  Map(`:`, Break_data$Start, Break_data$End)
LU <- data.frame(value = unlist(LU),
                 index = rep(seq_along(LU), lapply(LU, length)))
df$Break <- Break_data$Break[LU$index[match(df$date, LU$value)]]


# if not mapped(df$Break ==NA), then set it to "Non_break"
df$Break <- ifelse(is.na(df$Break), "Non_Break", df$Break)
df$Break <- factor(df$Break)
df
#>          date          Break
#> 1  2016-02-05      Non_Break
#> 2  2016-02-05      Non_Break
#> 3  2016-02-05      Non_Break
#> 4  2016-02-05      Non_Break
#> 5  2016-02-05      Non_Break
#> 6  2016-02-05      Non_Break
#> 7  2016-02-26   Spring_Break
#> 8  2016-10-07 Mid_Term_Break
#> 9  2018-03-30  Easter_Recess
#> 10 2018-07-12      Non_Break
#> 11 2018-07-12      Non_Break
#> 12 2018-07-12      Non_Break
#> 13 2018-07-12      Non_Break
#> 14 2018-07-12      Non_Break
#> 15 2018-07-12      Non_Break

Created on 2018-08-19 by the reprex package (v0.2.0).

Edit: full solution

King Julien
  • 159
  • 17
TC Zhang
  • 2,667
  • 1
  • 12
  • 19