मैं एक नया कॉलम बनाना चाहता हूं जिसमें केवल कोमॉर्बिडिटी (सह-रुग्णता) वाली स्ट्रिंग्स हों, और कोई अन्य श्रेणी न हो। मैं इसे R में करना चाहता हूं, और tidyverse वाला समाधान बेहतर रहेगा। आप देखेंगे कि 2 पंक्तियों में अजीब स्ट्रिंग्स हैं जिनमें मुझे कोई दिलचस्पी नहीं है। मेरे पास इस प्रकार का डेटा है।

   # Sample input (dput output): a 20-row tibble with the character columns
   # id, health_care_worker, health_cnd and health_1, plus the numeric column
   # how_unwell. A few health_cnd / health_1 entries hold a long
   # comma-separated string instead of a single value.
   structure(list(id = c("1", "2", "3", "4", "5", "6", "7", "9", 
"8", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", 
"20"), health_care_worker = c("No", "No", "No", "No", "Yes", 
"No", "No", "Yes", "No", "No", "No", "No", "No", "No", "No", 
"No", "No", "No", "No", "No"), how_unwell = c(1, 6, 1, 1, 1, 
6, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), health_cnd = c("None", 
NA, NA, "Diabetes Type 2,No,Yes,Yes,No,4,No,Showing Symptoms But Not Tested,Mild,Yes,No,No,No,Spanish,No, No,No,Yes,No 3bad24c8-0ac9-4269-aa53-5e8d41b03142,35,Female,Rio de Janeiro", 
NA, NA, NA, NA, "High Blood Pressure (hypertension),No,Yes,No,No,0,No,Self-Isolating With No Symptoms,None,No,No,No,No,Portuguese,No, No,No,No,Yes 2656b3f2-d916-43e1-96b2-1d371d8c7b12,58,Female,Belém/ Pará", 
NA, NA, NA, NA, NA, NA, NA, NA, "High Blood Pressure (hypertension),No,Yes,No,No,15,No,Showing Symptoms But Not Tested,Moderate,Yes,No,No,No,Spanish,No, No,No,Yes,No 41cf840a-cfcc-441f-a995-f6b75ecee967,22,Male,Agb,India,2020-08-04 05:25:00,No,No,No,No,No,No,No,1,None,N", 
NA, NA), health_1 = c("None", "None", "None", "Diabetes Type 2,No, Asthma, Yes,Yes,No,4,No,Showing Symptoms But Not Tested,Mild,Yes,No,No,No,Spanish,No, No,No,Yes,No 3bad24c8-0ac9-4269-aa53-5e8d41b03142,35,Female,Rio de Janeiro", 
"None", "None", "None", "None", "High Blood Pressure (hypertension),No, Obesity, Yes,No,No,0,No,Self-Isolating With No Symptoms,None,No,No,No,No,Portuguese,No, No,No,No,Yes 2656b3f2-d916-43e1-96b2-1d371d8c7b12,58,Female,Belém/ Pará", 
"None", "None", "None", "None", "None", "None", "None", "None", 
"High Blood Pressure (hypertension),No,Lung-condition, Yes,No,No,15,No,Showing Symptoms But Not Tested,Moderate,Yes,No,No,No,Spanish,No, No,No,Yes,No 41cf840a-cfcc-441f-a995-f6b75ecee967,22,Male,Agb,India,2020-08-04 05:25:00,No,No,No,No,No,No,No,1,None,N", 
"None", "None")), row.names = c(NA, -20L), class = c("tbl_df", 
"tbl", "data.frame"))

और इस तरह मैं अपना नया कॉलम चाहता हूं।

# Desired output (dput output): a 20-row tibble with the columns id,
# health_care_worker, how_unwell and health_1, plus a new column
# copy_health_column that holds only the condition names ("None" for rows
# whose health_1 is "None").
structure(list(id = c("1", "2", "3", "4", "5", "6", "7", "9", 
"8", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", 
"20"), health_care_worker = c("No", "No", "No", "No", "Yes", 
"No", "No", "Yes", "No", "No", "No", "No", "No", "No", "No", 
"No", "No", "No", "No", "No"), how_unwell = c(1, 6, 1, 1, 1, 
6, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), health_1 = c("None", 
"None", "None", "Diabetes Type 2,No, Asthma, Yes,Yes,No,4,No,Showing Symptoms But Not Tested,Mild,Yes,No,No,No,Spanish,No, No,No,Yes,No 3bad24c8-0ac9-4269-aa53-5e8d41b03142,35,Female,Rio de Janeiro", 
"None", "None", "None", "None", "High Blood Pressure (hypertension),No, Obesity, Yes,No,No,0,No,Self-Isolating With No Symptoms,None,No,No,No,No,Portuguese,No, No,No,No,Yes 2656b3f2-d916-43e1-96b2-1d371d8c7b12,58,Female,Belém/ Pará", 
"None", "None", "None", "None", "None", "None", "None", "None", 
"High Blood Pressure (hypertension),No,Lung-condition, Yes,No,No,15,No,Showing Symptoms But Not Tested,Moderate,Yes,No,No,No,Spanish,No, No,No,Yes,No 41cf840a-cfcc-441f-a995-f6b75ecee967,22,Male,Agb,India,2020-08-04 05:25:00,No,No,No,No,No,No,No,1,None,N", 
"None", "None"), copy_health_column = c("None", "None", "None", 
"Diabetes Type 2, Asthma", "None", "None", "None", "None", "High Blood Pressure (hypertension), Obesity", 
"None", "None", "None", "None", "None", "None", "None", "None", 
"High Blood Pressure (hypertension),Lung-condition", "None", 
"None")), row.names = c(NA, -20L), class = c("tbl_df", "tbl", 
"data.frame"))

अब, मेरे मूल डेटा में 100K से अधिक डेटा बिंदु हैं। इसलिए, मुझे उम्मीद है कि मुझे एक ऐसा समाधान मिलेगा जो बड़े डेटासेट के लिए लागू हो।

0
GaB 14 अगस्त 2020, 14:17

1 उत्तर

सबसे बढ़िया उत्तर
library(tidyverse)

# Condition names to pull out of the free-text `health_1` column.
# "Lung-condition" is included because the desired output in the question
# lists it. Parentheses are escaped so they match literally.
comorbidity_pattern <- paste(
  c(
    "Diabetes Type 2",
    "Asthma",
    "Obesity",
    "Lung-condition",
    "High Blood Pressure \\(hypertension\\)"
  ),
  collapse = "|"
)

# Collapse the matches found in one cell into a single string.
# - no match      -> "None" (what the desired output shows for such rows)
# - missing input -> NA (instead of the literal string "NA")
# - otherwise     -> matches joined with ","
collapse_matches <- function(matches) {
  if (length(matches) == 0) {
    return("None")
  }
  if (anyNA(matches)) {
    return(NA_character_)
  }
  paste(matches, collapse = ",")
}

# str_extract_all() is vectorised over the whole column, so rowwise() is not
# needed; dropping it avoids per-row evaluation on large data and returns a
# plain (non-rowwise) tibble.
df %>%
  mutate(
    copy_health_column = map_chr(
      str_extract_all(health_1, pattern = comorbidity_pattern),
      collapse_matches
    )
  )
1
Jakub.Novotny 14 अगस्त 2020, 13:57