Simple but not easy merge task

Question

I have two incomplete dataframes (df_a, df_b): Columns are missing or NA values. "by" is the merge index and df_a has "priority" over df_b.

df_a = structure(list(Datum = structure(c(1635163200, 1635166800, 1635170400, 1635174000), class = c("POSIXct", "POSIXt")), Vorhersage = c(10.297922, 10.155121, 10.044135, 9.699513), Export = c("10.912", "10.47", NA, NA), color = c("rgb(0,128,0)", "rgb(0,128,0)", NA, NA), Status = c("ok", "ok", NA, NA), Plausibilität = c("4", "4", NA, NA), min = c(7.93000000000001, 9.4, 8.7, 8.3), max = c(12.31715325, 12.42822725, 12.51326325, 12.28620625)), row.names = c(NA, -4L), class = "data.frame")
df_b = structure(list(Datum = structure(c(1632510000, 1632513600, 1632517200, 1632520800), class = c("POSIXct", "POSIXt")), Vorhersage = c(14.821988, 14.832919, 14.706179, 14.573527), Referenz = c(16.6, 16.2, 15.9, 16), DWD_Name = c("Elpersbüttel", "Elpersbüttel", "Elpersbüttel", "Elpersbüttel"), Export = c(17.198, 16.713, 16.378, 16.358), color = c("rgb(0,128,0)", "rgb(0,128,0)", "rgb(0,128,0)", "rgb(0,128,0)"), Status = c("ok", "ok", "ok", "ok"), Plausibilität = c(4, 4, 4, 4), min = c(13.05, 12.808, 11.631891, 12.312), max = c(17, 17, 16.9, 16.7)), row.names = c(NA, -4L), class = "data.frame")

desired output is:

                Datum Vorhersage Export        color Status Plausibilität  min max Referenz
1 2021-10-25 14:00:00       10.3 10.912 rgb(0,128,0)     ok             4  7.9  12       NA
2 2021-10-25 15:00:00       10.2  10.47 rgb(0,128,0)     ok             4  9.4  12       NA
3 2021-10-25 16:00:00       10.0   <NA>         <NA>   <NA>          <NA>  8.7  13       NA
4 2021-10-25 17:00:00        9.7   <NA>         <NA>   <NA>          <NA>  8.3  12       NA
5 2021-09-24 21:00:00       14.8 17.198 rgb(0,128,0)     ok             4 13.1  17       17
6 2021-09-24 22:00:00       14.8 16.713 rgb(0,128,0)     ok             4 12.8  17       16
7 2021-09-24 23:00:00       14.7 16.378 rgb(0,128,0)     ok             4 11.6  17       16
8 2021-09-25 00:00:00       14.6 16.358 rgb(0,128,0)     ok             4 12.3  17       16
      DWD_Name
1         <NA>
2         <NA>
3         <NA>
4         <NA>
5 Elpersbüttel
6 Elpersbüttel
7 Elpersbüttel
8 Elpersbüttel

# for rebuild:
structure(list(Datum = structure(c(1635163200, 1635166800, 1635170400, 
1635174000, 1632510000, 1632513600, 1632517200, 1632520800), class = c("POSIXct", 
"POSIXt")), Vorhersage = c(10.297922, 10.155121, 10.044135, 9.699513, 
14.821988, 14.832919, 14.706179, 14.573527), Export = c("10.912", 
"10.47", NA, NA, "17.198", "16.713", "16.378", "16.358"), color = c("rgb(0,128,0)", 
"rgb(0,128,0)", NA, NA, "rgb(0,128,0)", "rgb(0,128,0)", "rgb(0,128,0)", 
"rgb(0,128,0)"), Status = c("ok", "ok", NA, NA, "ok", "ok", "ok", 
"ok"), Plausibilität = c("4", "4", NA, NA, "4", "4", "4", "4"
), min = c(7.93000000000001, 9.4, 8.7, 8.3, 13.05, 12.808, 11.631891, 
12.312), max = c(12.31715325, 12.42822725, 12.51326325, 12.28620625, 
17, 17, 16.9, 16.7), Referenz = c(NA, NA, NA, NA, 16.6, 16.2, 
15.9, 16), DWD_Name = c(NA, NA, NA, NA, "Elpersbüttel", "Elpersbüttel", 
"Elpersbüttel", "Elpersbüttel")), row.names = c(NA, -8L), class = "data.frame")

Thanks to the help of @r2evans I tried the following:


by = "Datum"

library(data.table)
colnms <- setdiff(intersect(names(df_a), names(df_b)), by)

setDT(df_a)
setDT(df_b)

merge(df_a, df_b, by = by, all = TRUE
)[, (colnms) := lapply(colnms, function(nm) fcoalesce(.SD[[paste0(nm, ".x")]], .SD[[paste0(nm, ".y")]]))
  ][, c(outer(colnms, c(".x", ".y"), paste0)) := NULL ][]

but I get the following error:

 Error in fcoalesce(.SD[[paste0(nm, ".x")]], .SD[[paste0(nm, ".y")]]) : 
  Item 2 is type double but the first item is type character. Please coerce

@KarthikS, your answer is correct, I think you should undelete it. — r2evans, Oct 19 '21 at 15:23
What does it mean that `df_a` has priority? Why is do you take the second number in c(2,7,NA,2) from `df_b` when it is available in `df_a`? — sindri_baldur, Oct 19 '21 at 16:45
@r2evans: ran into trouble again.. my function works, but your solution is faster.. I updated the example maybe you can help. — Nimrookie, Oct 26 '21 at 12:14
I think the error is clear: you are potentially corrupting your data by trying to merge not-the-same class vectors. For instance, `Export` and `Plausibilität` are `character` in `df_a` and `numeric` in `df_b`. My guess is that `df_a`'s of those two columns are supposed to be `numeric`. If you fix that with `df_a[,(c("Export","Plausibilität")) := lapply(.SD, as.numeric), .SDcols=c("Export","Plausibilität")]`, the merge command works. Pay attention to your classes, it makes a difference. (I usually do not trust merging functions to know how to coerce different classes.) — r2evans, Oct 26 '21 at 12:26
If one of the answers addresses your question, please [accept it](https://stackoverflow.com/help/someone-answers); doing so not only provides a little perk to the answerer with some points, but also provides some closure for readers with similar questions. Though you can only accept one answer, you have the option to up-vote as many as you think are helpful. (If there are still issues, you will likely need to edit your question with further details.) — r2evans, Oct 27 '21 at 21:09

r2evans · Answer 1 · 2021-10-23T22:57:33.553

Most of the other answers are good, but many either over-complicate the result (in my opinion) or they perform a left or right join, not the full join as expected in the OP.

Here's a quick solution that uses dynamic column names.

library(data.table)
colnms <- setdiff(intersect(names(df_a), names(df_b)), "by")
colnms
# [1] "a"

setDT(df_a)
setDT(df_b)

merge(df_a, df_b, by = "by", all = TRUE
  )[, (colnms) := lapply(colnms, function(nm) fcoalesce(.SD[[paste0(nm, ".x")]], .SD[[paste0(nm, ".y")]]))
  ][, c(outer(colnms, c(".x", ".y"), paste0)) := NULL ][]
#       by     b     c     a
#    <num> <num> <num> <num>
# 1:     1     1    NA     1
# 2:     2    NA     2     2
# 3:     3     3     3     3
# 4:     4    NA     4     4

Notes:

the normal data.table::[ merge is a left-join only, so we need to use data.table::merge in order to be able to get a full-join with all=TRUE;
because it's using merge, the repeated columns get the .x and .y suffixes, something we can easily capitalize on;
the canonical and most-performant way when using (colnms) := ... is to also include .SDcols=colnms, but that won't work as well here since we need the suffixed columns, not the colnms columns themselves; this is a slight performance penalty but certainly not an anti-pattern (I believe) given what we need to do; and since we could have more than one duplicate column, we have to be careful to do it with each pair at a time, not all of them at once;
the last [-block (using outer) is for removing the duplicate columns; without it, the output would have column names c("by", "a.x", "b", "a.y", "c", "a"). It uses outer because that's a straight-forward way to get 1-or-more colnms and combine .x and .y to each of them; it then uses data.table's := NULL shortcut for removing one-or-more columns.

Thank you very much @r2evans for this very informative, enlightning and pedagogical response. In only three lines of code, I learned a lot! I really appreciate the sharing, the pedagogical effort... and, incidentally, I measure my margin of progress which is tremendous !!! ;-) — lovalery, Oct 24 '21 at 17:44
r2evans: Got the following error: Error in `[.data.table`(merge(df_a, df_b, by = by, all = TRUE)[, `:=`((colnms), : (converted from warning) Column 'Datum.x' does not exist to remove — Nimrookie, Oct 25 '21 at 14:26
Okay, I understand. It appears that the issue is that your sample data is not representative (enough). Please update your data to something that can reproduce this error you have. — r2evans, Oct 25 '21 at 14:29

lovalery · Answer 2 · 2021-10-19T17:40:26.840

0

Not the answer with R base. But one possible solution with the package data.table

library(data.table)

setDT(df_a)
setDT(df_b)

df_a <- rbind(df_a, list(4, NA, NA))
df_b <- rbind(list(1, NA, NA), df_b)


df_a[df_b, `:=` (a = fifelse(is.na(a), i.a, a), c = c), on = .(by)][]
#>    by a  b  c
#> 1:  1 1  1 NA
#> 2:  2 2 NA  2
#> 3:  3 3  3  3
#> 4:  4 4 NA  4

Edit with the help of @r2evans, A much more elegant and efficient solution:

df_a[df_b, `:=` (a = fcoalesce(a, i.a), c = c), on = .(by)][]
#>    by a  b  c
#> 1:  1 1  1 NA
#> 2:  2 2 NA  2
#> 3:  3 3  3  3
#> 4:  4 4 NA  4

^{Created on 2021-10-19 by the reprex package (v2.0.1)}

edited Oct 19 '21 at 17:40

answered Oct 19 '21 at 15:43

lovalery

4,254
3
13
27

1

`data.table::fcoalesce(a, i.a)` is likely faster than `fifelse(...)` and made for this situation. – r2evans Oct 19 '21 at 17:12
1

Thank you so much @r2evans. To be honest I didn't know this function... magic! So, I will edit the answer. – lovalery Oct 19 '21 at 17:20
@lovalery, r2evans: Fast solution! Thanks! Is there a way to let it work with dynamic col names? – Nimrookie Oct 20 '21 at 21:42
Hi Nimrookie, I'm not sure if this is what you are looking for. But I'm giving it a try! I'm not very experienced in this, but I think you have to abandon the functional form of the code to benefit from dynamic column names. Therefore, in the example you provided, I would write the code as follows: `colnames_vector – lovalery Oct 23 '21 at 13:52

Adam · Answer 3 · 2021-10-19T16:26:59.220

This isn't the most elegant, but you can make a function that applies your rule to coalesce the values if they occur in both data frames.

# find the unique column names (not called "by")
cols <- union(names(df_a),names(df_b))
cols <- cols[!(cols == "by")]

# merge the data sets
df_merge <- merge(df_a, df_b, by = "by", all = TRUE)

# function to check for the base column names that now have a '.x' and
# a '.y' version. for the columns, fill in the NAs from '.x' with the
# value from '.y'
col_val <- function(col_base, df) {
  
  x <- names(df)
  if (all(paste0(col_base, c(".x", ".y")) %in% x)) {
    na.x <- is.na(df[[paste0(col_base, ".x")]])
    df[[paste0(col_base, ".x")]][na.x] <- df[[paste0(col_base, ".y")]][na.x]
    df[[paste0(col_base, ".x")]]
  } else {
    df[[col_base]]
  }
  
}

# apply this function to every column
cbind(df_merge["by"], sapply(cols, col_val, df = df_merge))

This will give the following result.

  by a  b  c
1  1 1  1 NA
2  2 2 NA  2
3  3 3  3  3
4  4 4 NA  4

I know you specified base, by the natural_join() function is worth mentioning.

library(rqdatatable)

natural_join(df_a, df_b, by = "by", jointype = "FULL")

This gives exactly what you want.

  by a  b  c
1  1 1  1 NA
2  2 2 NA  2
3  3 3  3  3
4  4 4 NA  4

score 0 · Answer 4 · answered Oct 21 '21 at 18:00

here a dynamic solution.. not bad, but maybe someone knows how to speed it up.

get_complete_df<-function(df_a,df_b, by = "by"){
  df_a  = unique(df_a)
  df_b  = unique(df_b)
  nam_a = names(df_a)[!(names(df_a) == by)] 
  nam_b = names(df_b)[!(names(df_b) == by)]
  nums_a = unlist(lapply(df_a, is.numeric)) 
  nums_b = unlist(lapply(df_b, is.numeric))
  nums   = unique(names(df_a)[nums_a],names(df_b)[nums_b])
  ## try to supplement NAs
  x = df_b[[by]][df_b[[by]] %in% df_a[[by]]]
  y = nam_b[nam_b %in% nam_a]
  vna = is.na(df_a[df_a[,1] %in% x,y]) 
  df_a[df_a[,1] %in% x ,y][vna] = df_b[df_b[,1] %in% x,y][vna]
  ## get complete df
  all_names  = c(nam_a,nam_b )
  all_names  = c(by, unique(all_names))
  all_by     = na.omit(unique(c(df_a[[by]],df_b[[by]]) ))
  ## build
  df_o = as.data.frame(matrix(nrow = length(all_by),ncol = length(all_names)))
  names(df_o) = all_names
  df_o[[by]]  = all_by
  ## fill in content 
  df_o[df_o[,1] %in% df_b[,1],names(df_b)] = df_b
  df_o[df_o[,1] %in% df_a[,1],names(df_a)] = df_a ## df_a has priority!
  # fix numeric:
  # why did some(!) num fields changed to chr ?
  df_o[,nums]   = as.data.frame(apply(df_o[,nums], 2, as.numeric))
  df_o
}

Simple but not easy merge task

4 Answers4