मेरे पास कारक चर है जो दो स्तंभों में होता है और अब मैं पहला अंतराल चाहता हूं, इससे कोई फर्क नहीं पड़ता कि आखिरी बार कौन सा स्तंभ कारक दिखाई दिया।

निम्नलिखित data.table पर विचार करें।

require(data.table)
set.seed(21)
dt <- data.table(item1 = c(rep(sample(letters[1:5]), 2), sample(letters[6:10])),
           item2 = c(rep(sample(letters[6:10]), 2), sample(letters[1:5])),
           value1 = rnorm(15, 5, 2),
           value2 = rnorm(15, 5, 2),
           iteration = rep(1:3, each = 5))
> dt
    item1 item2    value1   value2 iteration
 1:     d     i 0.4464375 6.491179         1
 2:     b     j 6.5148245 5.665638         1
 3:     c     f 3.9031889 2.751919         1
 4:     a     g 5.3450990 3.587738         1
 5:     e     h 6.1257061 3.544912         1
 6:     d     i 8.0236359 1.331371         2
 7:     b     j 6.3180503 4.184624         2
 8:     c     f 7.2440561 5.053722         2
 9:     a     g 3.4307173 6.823257         2
10:     e     h 4.1486154 8.268693         2
11:     j     a 5.7859952 5.121371         3
12:     f     c 5.0735143 8.695145         3
13:     i     e 2.9358327 5.160250         3
14:     g     d 2.4702771 7.837112         3
15:     h     b 4.5460694 7.917232         3

मैंने इसे data.table पैकेज के साथ हल करने का प्रयास किया है।

    dt[, lag1 := c(NA, value1), by = item1]
    dt[, lag2 := c(NA, value2), by = item2]
    dt
    item1 item2    value1   value2 iteration      lag1     lag2
 1:     d     i 0.4464375 6.491179         1        NA       NA
 2:     b     j 6.5148245 5.665638         1        NA       NA
 3:     c     f 3.9031889 2.751919         1        NA       NA
 4:     a     g 5.3450990 3.587738         1        NA       NA
 5:     e     h 6.1257061 3.544912         1        NA       NA
 6:     d     i 8.0236359 1.331371         2 0.4464375 6.491179
 7:     b     j 6.3180503 4.184624         2 6.5148245 5.665638
 8:     c     f 7.2440561 5.053722         2 3.9031889 2.751919
 9:     a     g 3.4307173 6.823257         2 5.3450990 3.587738
10:     e     h 4.1486154 8.268693         2 6.1257061 3.544912
11:     j     a 5.7859952 5.121371         3        NA       NA
12:     f     c 5.0735143 8.695145         3        NA       NA
13:     i     e 2.9358327 5.160250         3        NA       NA
14:     g     d 2.4702771 7.837112         3        NA       NA
15:     h     b 4.5460694 7.917232         3        NA       NA

मैं शायद आइटम के लिए एक कॉलम और मूल्य के लिए एक बनाकर इसे हल कर सकता हूं, लेकिन क्या कोई बेहतर समाधान है?

और स्पष्ट होने के लिए, पंक्ति 11 पर lag1 पर मेरा अपेक्षित मूल्य 4.184624 है।

मुझे आइटम 2 के लिए भी अंतराल की आवश्यकता होगी और यह dplyr किया।

dt %>% 
  mutate(nr = 1:nrow(dt)) %>% 
  gather(key, value, -nr, -iteration) %>% 
  mutate(key = ifelse(key == "item1" | key == "item2", "item", "value"),
         variabel = rep(c(1, 2), 2, each = nrow(dt))) %>% 
  spread(key, value) %>% 
  group_by(item) %>% 
  arrange(nr) %>%
  mutate(lag = lag(value)) %>% 
  gather(key, value, -iteration, -nr, -variabel) %>%
  unite(key, c("key", "variabel"), sep = "") %>% 
  spread(key, value)

   iteration    nr item1 item2             lag1              lag2            value1           value2
 *     <int> <int> <chr> <chr>            <chr>             <chr>             <chr>            <chr>
 1         1     1     e     f             <NA>              <NA>  4.48327811883486 5.98823833422944
 2         1     2     b     i             <NA>              <NA>  6.21252978898878  3.6803830789734
 3         1     3     d     g             <NA>              <NA>  5.62689643314086 7.00228385274896
 4         1     4     c     h             <NA>              <NA>  5.10720616395708 7.14416894881173
 5         1     5     a     j             <NA>              <NA>  7.25650757535391 6.51153141154262
 6         2     6     e     f 4.48327811883486  5.98823833422944  3.88373308164829 2.08907058913021
 7         2     7     b     i 6.21252978898878   3.6803830789734  8.07191789162847 6.88574195362948
 8         2     8     d     g 5.62689643314086  7.00228385274896  4.87510729533042 1.25944984673148
 9         2     9     c     h 5.10720616395708  7.14416894881173   5.0431504307243  4.4934555124612
10         2    10     a     j 7.25650757535391  6.51153141154262 0.820345123625779 4.41487625686153
11         3    11     g     d 1.25944984673148  4.87510729533042  3.37822264689098 5.43753611910662
12         3    12     j     a 4.41487625686153 0.820345123625779 -0.88757977661203 2.28986114731552
13         3    13     i     e 6.88574195362948  3.88373308164829  4.96240860503556 4.75454561215201
14         3    14     h     b  4.4934555124612  8.07191789162847  4.29063975464589 4.09626986248512
15         3    15     f     c 2.08907058913021   5.0431504307243  5.07114037497055 5.19449624162733
1
Tomas Ericsson 15 अप्रैल 2018, 23:30

2 जवाब

सबसे बढ़िया उत्तर

row number के लिए एक कॉलम जोड़ने के बाद melt का उपयोग करके समाधान निकाला जा सकता है।

library(data.table)

#Add a column to represent row number in current table. This will be used
#later to sort data.table to find correct `lag`
dt[,rn:=.I]

#Use melt to transfer values in long format. value1 contains "items" and 
#value2 contains "values" 
dt<-melt(dt, id=c("iteration","rn"), measure=list(1:2,c("value1","value2")),
value.name = c("item","value"))

#The order in original table is row number, item1 and item2. The same can be 
#achieved by sorting on "rn" and "variable"
dt[order(rn,variable), lag := shift(value), by = item]

dt[variable==1,][dt[variable==2,],.(item1 = item, item2 = i.item, value1 = value, 
        value2=i.value, iteration, lag1 = lag, lag2 = i.lag), on=("rn")]

#   item1 item2    value1   value2 iteration      lag1     lag2
# 1:     d     i 0.4464375 6.491179         1        NA       NA
# 2:     b     j 6.5148245 5.665638         1        NA       NA
# 3:     c     f 3.9031889 2.751919         1        NA       NA
# 4:     a     g 5.3450990 3.587738         1        NA       NA
# 5:     e     h 6.1257061 3.544912         1        NA       NA
# 6:     d     i 8.0236359 1.331371         2 0.4464375 6.491179
# 7:     b     j 6.3180503 4.184624         2 6.5148245 5.665638
# 8:     c     f 7.2440561 5.053722         2 3.9031889 2.751919
# 9:     a     g 3.4307173 6.823257         2 5.3450990 3.587738
# 10:    e     h 4.1486154 8.268693         2 6.1257061 3.544912
# 11:    j     a 5.7859952 5.121371         3 4.1846241 3.430717
# 12:    f     c 5.0735143 8.695145         3 5.0537224 7.244056
# 13:    i     e 2.9358327 5.160250         3 1.3313712 4.148615
# 14:    g     d 2.4702771 7.837112         3 6.8232573 8.023636
# 15:    h     b 4.5460694 7.917232         3 8.2686930 6.318050
3
MKR 18 अप्रैल 2018, 00:06

एक और समान दृष्टिकोण पोस्ट करना। आइटम 1 + आइटम 2 के विस्तारित संस्करण को लंबे डेटा.टेबल में उपयोग करने के समान। जॉइन का उपयोग करने में अंतर है।

2 संभावित स्थितियां हैं:

1) अंतराल हमेशा तत्काल पिछले पुनरावृत्ति में होता है, फिर सामान्य जुड़ाव का उपयोग करके निम्न कोड काम करना चाहिए:

library(data.table)
set.seed(21)
dt <- data.table(item1 = c(rep(sample(letters[1:5]), 2), sample(letters[6:10])),
    item2 = c(rep(sample(letters[6:10]), 2), sample(letters[1:5])),
    value1 = rnorm(15, 5, 2),
    value2 = rnorm(15, 5, 2),
    iteration = rep(1:3, each = 5))

#if that first lag can always be found in previous iteration
dt[.(iitem=c(item1, item2), ivalue=c(value1, value2), iiteration=c(iteration + 1, iteration + 1)),
    lag1 := ivalue,
    on=c(item1="iitem", iteration="iiteration")]
dt[.(iitem=c(item1, item2), ivalue=c(value1, value2), iiteration=c(iteration + 1, iteration + 1)),
    lag2 := ivalue,
    on=c(item2="iitem", iteration="iiteration")]
dt

#     item1 item2             value1            value2 iteration               lag1              lag2
#  1:     d     i 0.4464375195067456 6.491178609416053         1                 NA                NA
#  2:     b     j 6.5148244502509627 5.665638360665036         1                 NA                NA
#  3:     c     f 3.9031888919439428 2.751919085284464         1                 NA                NA
#  4:     a     g 5.3450989557007524 3.587738435542055         1                 NA                NA
#  5:     e     h 6.1257061355108435 3.544912270783058         1                 NA                NA
#  6:     d     i 8.0236359188753603 1.331371229451156         2 0.4464375195067456 6.491178609416053
#  7:     b     j 6.3180503383288116 4.184624119479032         2 6.5148244502509627 5.665638360665036
#  8:     c     f 7.2440561493491140 5.053722389597528         2 3.9031888919439428 2.751919085284464
#  9:     a     g 3.4307172617070858 6.823257275121762         2 5.3450989557007524 3.587738435542055
# 10:     e     h 4.1486154223793710 8.268692951017332         2 6.1257061355108435 3.544912270783058
# 11:     j     a 5.7859951827443368 5.121371228719468         3 4.1846241194790323 3.430717261707086
# 12:     f     c 5.0735142596491132 8.695145055731583         3 5.0537223895975281 7.244056149349114
# 13:     i     e 2.9358326775434151 5.160249909302514         3 1.3313712294511557 4.148615422379371
# 14:     g     d 2.4702770572371642 7.837111765957783         3 6.8232572751217617 8.023635918875360
# 15:     h     b 4.5460694295527579 7.917231870893728         3 8.2686929510173321 6.318050338328812

2) यदि अंतराल पहले के पुनरावृत्तियों में हो सकता है, तो गैर-इक्वि जॉइन का उपयोग करने वाले निम्नलिखित कोड को मदद करनी चाहिए

library(data.table)
set.seed(21)
dt <- data.table(item1 = c(rep(sample(letters[1:5]), 2), sample(letters[6:10])),
    item2 = c(rep(sample(letters[6:10]), 2), sample(letters[1:5])),
    value1 = rnorm(15, 5, 2),
    value2 = rnorm(15, 5, 2),
    iteration = rep(1:3, each = 5))
#remove iteration=2, item1=c, item2=f to show finding lag from earlier iterations
dt <- dt[-8,]

#if that first lag can be found in even earlier iteration, using non-equi joins as follows:
elongated <- dt[,.(item=c(item1, item2), ivalue=c(value1, value2), iteration=c(iteration, iteration), cpyalliter=c(iteration, iteration))]

dt[, lag1 := elongated[.SD, on=.(item=item1, iteration < iteration)][,
    last(ivalue), by=.(item1=item, item2, value1, value2, iteration)]$V1 ]

dt[, lag2 := elongated[.SD, on=.(item=item2, iteration < iteration)][,
    last(ivalue), by=.(item1, item2=item, value1, value2, iteration)]$V1 ]

dt

#     item1 item2             value1            value2 iteration               lag1              lag2
#  1:     d     i 0.4464375195067456 6.491178609416053         1                 NA                NA
#  2:     b     j 6.5148244502509627 5.665638360665036         1                 NA                NA
#  3:     c     f 3.9031888919439428 2.751919085284464         1                 NA                NA
#  4:     a     g 5.3450989557007524 3.587738435542055         1                 NA                NA
#  5:     e     h 6.1257061355108435 3.544912270783058         1                 NA                NA
#  6:     d     i 8.0236359188753603 1.331371229451156         2 0.4464375195067456 6.491178609416053
#  7:     b     j 6.3180503383288116 4.184624119479032         2 6.5148244502509627 5.665638360665036
#  8:     a     g 3.4307172617070858 6.823257275121762         2 5.3450989557007524 3.587738435542055
#  9:     e     h 4.1486154223793710 8.268692951017332         2 6.1257061355108435 3.544912270783058
# 10:     j     a 5.7859951827443368 5.121371228719468         3 4.1846241194790323 3.430717261707086
# 11:     f     c 5.0735142596491132 8.695145055731583         3 2.7519190852844644 3.903188891943943
# 12:     i     e 2.9358326775434151 5.160249909302514         3 1.3313712294511557 4.148615422379371
# 13:     g     d 2.4702770572371642 7.837111765957783         3 6.8232572751217617 8.023635918875360
# 14:     h     b 4.5460694295527579 7.917231870893728         3 8.2686929510173321 6.318050338328812

मुझे आश्चर्य है कि क्या दूसरा मामला अधिक संक्षेप में लिखने का कोई तरीका है (यानी थोड़ी कम श्रृंखला के साथ)

0
chinsoon12 17 अप्रैल 2018, 04:21