The first step was to collect data through a survey on Facebook trolling. I got a total of 57 responses.
## Parsed with column specification:
## cols(
## .default = col_character()
## )
## See spec(...) for full column specifications.
## # A tibble: 6 x 27
## StartDate EndDate Status IPAddress Progress `Duration (in s… Finished
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Start Date End Date Respon… IP Addre… Progress Duration (in se… Finished
## 2 "{\"Impor… "{\"Imp… "{\"Im… "{\"Impo… "{\"Imp… "{\"ImportId\":… "{\"Imp…
## 3 2018-03-3… 2018-03… Survey… <NA> 100 6 True
## 4 2018-03-3… 2018-03… IP Add… 185.69.1… 100 3 True
## 5 2018-03-3… 2018-03… IP Add… 212.236.… 100 90 True
## 6 2018-03-3… 2018-03… IP Add… 78.92.3.… 100 39 True
## # ... with 20 more variables: RecordedDate <chr>, ResponseId <chr>,
## # RecipientLastName <chr>, RecipientFirstName <chr>,
## # RecipientEmail <chr>, ExternalReference <chr>, LocationLatitude <chr>,
## # LocationLongitude <chr>, DistributionChannel <chr>,
## # UserLanguage <chr>, Q1 <chr>, Q2 <chr>, Q2_3_TEXT <chr>, Q10 <chr>,
## # Q3 <chr>, Q5 <chr>, Q6 <chr>, Q7 <chr>, Q7_3_TEXT <chr>, Q9 <chr>
## # A tibble: 6 x 28
## id StartDate EndDate Status IPAddress Progress `Duration (in se…
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 1 2018-03-30… 2018-03-… Survey… <NA> 100 6
## 2 2 2018-03-30… 2018-03-… IP Add… 185.69.1… 100 3
## 3 3 2018-03-30… 2018-03-… IP Add… 212.236.… 100 90
## 4 4 2018-03-30… 2018-03-… IP Add… 78.92.3.… 100 39
## 5 5 2018-03-30… 2018-03-… IP Add… 79.68.16… 100 98
## 6 6 2018-03-30… 2018-03-… IP Add… 14.139.1… 100 55
## # ... with 21 more variables: Finished <chr>, RecordedDate <chr>,
## # ResponseId <chr>, RecipientLastName <chr>, RecipientFirstName <chr>,
## # RecipientEmail <chr>, ExternalReference <chr>, LocationLatitude <chr>,
## # LocationLongitude <chr>, DistributionChannel <chr>,
## # UserLanguage <chr>, Q1 <chr>, Q2 <chr>, Q2_3_TEXT <chr>, Q10 <chr>,
## # Q3 <chr>, Q5 <chr>, Q6 <chr>, Q7 <chr>, Q7_3_TEXT <chr>, Q9 <chr>
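Let’s now see what the columns of our data set are. Note that, after the `id` column in first position, the next 17 columns (`StartDate` through `UserLanguage`) are metadata that Qualtrics inserted; the survey questions start at `Q1`.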
# List every column name of the survey data frame
names(data)
## [1] "id" "StartDate"
## [3] "EndDate" "Status"
## [5] "IPAddress" "Progress"
## [7] "Duration (in seconds)" "Finished"
## [9] "RecordedDate" "ResponseId"
## [11] "RecipientLastName" "RecipientFirstName"
## [13] "RecipientEmail" "ExternalReference"
## [15] "LocationLatitude" "LocationLongitude"
## [17] "DistributionChannel" "UserLanguage"
## [19] "Q1" "Q2"
## [21] "Q2_3_TEXT" "Q10"
## [23] "Q3" "Q5"
## [25] "Q6" "Q7"
## [27] "Q7_3_TEXT" "Q9"
Let’s see how many people have experienced trolling.
# Number of respondents per Q1 answer
data %>%
  group_by(Q1) %>%
  count()
## # A tibble: 2 x 2
## # Groups: Q1 [2]
## Q1 n
## <chr> <int>
## 1 No 20
## 2 Yes 37
# Bar chart of Q1 answers. Refer to the column by its bare name inside aes():
# `data$Q1` bypasses ggplot2's data masking and labels the axis "data$Q1".
ggplot(data, aes(Q1)) +
  geom_bar()
# create Q1_num: encode the Yes/No answer as 1/0.
# (as.numeric() on the raw text cannot parse "Yes"/"No" and would turn every
# row into NA.) Any answer other than "Yes"/"No" is left as NA.
data <- data %>%
  mutate(Q1_num = case_when(Q1 == "Yes" ~ 1,
                            Q1 == "No" ~ 0))
# show unsuccessful conversions (rows whose Q1 is neither "Yes" nor "No")
invalid_ids <- data %>% filter(is.na(Q1_num)) %>% pull(id)
data %>% filter(is.na(Q1_num)) %>% select(id, Q1, Q1_num)
## # A tibble: 57 x 3
## id Q1 Q1_num
## <chr> <chr> <dbl>
## 1 1 No NA
## 2 2 No NA
## 3 3 Yes NA
## 4 4 Yes NA
## 5 5 Yes NA
## 6 6 Yes NA
## 7 7 No NA
## 8 8 Yes NA
## 9 9 No NA
## 10 10 No NA
## # ... with 47 more rows
# Number of respondents per Q2 answer (missing answers form their own group)
data %>%
  group_by(Q2) %>%
  count()
## # A tibble: 3 x 2
## # Groups: Q2 [3]
## Q2 n
## <chr> <int>
## 1 Personally directed remark 13
## 2 Remark on a certain community or group that you belong to (it cou… 20
## 3 <NA> 24
# Bar chart of Q2 answers. Use the bare column name inside aes() rather than
# `data$Q2`, which bypasses ggplot2's data masking and mislabels the axis.
ggplot(data, aes(Q2)) +
  geom_bar()
# Number of respondents per Q3 answer
data %>%
  group_by(Q3) %>%
  count()
## # A tibble: 6 x 2
## # Groups: Q3 [6]
## Q3 n
## <chr> <int>
## 1 1-3 times a day 13
## 2 10-12 times a day 15
## 3 I don't keep a track 2
## 4 Once a day 1
## 5 Once a week 2
## 6 <NA> 24
# Bar chart of Q3 answers; the filter removes empty strings and, because a
# comparison with NA is never TRUE, missing answers as well.
data %>%
  filter(Q3 != "") %>%
  ggplot(aes(Q3)) +
  geom_bar()
# Rows with no missing value in any column (result is only printed)
drop_na(data)
## # A tibble: 0 x 29
## # ... with 29 variables: id <chr>, StartDate <chr>, EndDate <chr>,
## # Status <chr>, IPAddress <chr>, Progress <chr>, `Duration (in
## # seconds)` <chr>, Finished <chr>, RecordedDate <chr>, ResponseId <chr>,
## # RecipientLastName <chr>, RecipientFirstName <chr>,
## # RecipientEmail <chr>, ExternalReference <chr>, LocationLatitude <chr>,
## # LocationLongitude <chr>, DistributionChannel <chr>,
## # UserLanguage <chr>, Q1 <chr>, Q2 <chr>, Q2_3_TEXT <chr>, Q10 <chr>,
## # Q3 <chr>, Q5 <chr>, Q6 <chr>, Q7 <chr>, Q7_3_TEXT <chr>, Q9 <chr>,
## # Q1_num <dbl>
# Number of respondents per Q10 answer
data %>%
  group_by(Q10) %>%
  count()
## # A tibble: 4 x 2
## # Groups: Q10 [4]
## Q10 n
## <chr> <int>
## 1 Maybe 4
## 2 No 18
## 3 Yes 11
## 4 <NA> 24
# Bar chart of Q10 answers. Use the bare column name inside aes() rather than
# `data$Q10`, which bypasses ggplot2's data masking and mislabels the axis.
ggplot(data, aes(Q10)) +
  geom_bar()
# Hand-entered 2 x 3 table of counts: rows are Male/Female, columns are
# PR, CR and Report.
heatmap_1 <- as.table(
  matrix(
    c(14, 10, 26,
      12, 8, 17),
    nrow = 2,
    byrow = TRUE,
    dimnames = list(c("Male", "Female"), c("PR", "CR", "Report"))
  )
)
heatmap_1
## PR CR Report
## Male 14 10 26
## Female 12 8 17
# Matrix form of the count table, as input for heatmap()
heat_matrix <- as.matrix(heatmap_1)
Here I was trying to create a heatmap to look for patterns between males and females: which group reported more than the other, and whether males or females faced personal remarks (PR) or community remarks (CR) more than the other group.
# Draw the heatmap without dendrograms/reordering and without rescaling,
# so cell colours reflect the raw counts.
malefemale_heatmap <- heatmap(
  heat_matrix,
  Rowv = NA,
  Colv = NA,
  col = heat.colors(256),
  scale = "none",
  margins = c(5, 5)
)
I was interested in running a t-test to see which group, males or females, spends more time on Facebook. To do that, I created a new column that stores the average number of times per day people log in to their Facebook account, and another column that stores the time (in minutes) people spend each time they log in. I then multiplied the two to get the total time spent by each respondent, and ran the t-test on that total for males versus females.
# Recode the Q3 (login frequency) and Q5 (session length) answers as numbers.
# The codes are written as numeric literals directly instead of strings that
# are converted with as.numeric() afterwards.
# Answers that match no branch get NA. That includes missing answers: a
# comparison with NA is never TRUE, so the literal "NA" branch only ever
# matches the text "NA", not a missing value.
data <- data %>%
  mutate(
    # Q3: approximate number of logins per day
    Q3_num = case_when(
      Q3 == "10-12 times a day" ~ 11,
      Q3 == "Once a day" ~ 1,
      # NOTE(review): the midpoint of 1-3 is 2, not 1.5 -- confirm intent
      Q3 == "1-3 times a day" ~ 1.5,
      Q3 == "Once a week" ~ 0.14,
      Q3 == "I don't keep a track" ~ 0,
      Q3 %in% c("", "NA") ~ 0
    ),
    # Q5: approximate minutes spent per session
    Q5_num = case_when(
      Q5 == "1-5 minutes (Quickly checking notifications)" ~ 2.5,
      Q5 == "An hour" ~ 60,
      # NOTE(review): this looks like a Q3 answer copied over -- confirm
      Q5 == "1-3 times a day" ~ 1.5,
      Q5 == "10-15 minutes" ~ 12.5,
      Q5 == "It really depends on the day" ~ 0,
      Q5 == "I have no idea" ~ 0,
      Q5 %in% c("", "NA") ~ 0
    )
  ) %>%
  # keep only respondents for whom both answers could be recoded
  drop_na(Q3_num, Q5_num) %>%
  # total time = logins per day x minutes per session
  mutate(total_time = Q3_num * Q5_num)
# Descriptive statistics for the recoded login frequency (Q3_num)
data %>%
  summarise(
    Q3_mean = mean(Q3_num),
    Q3_median = median(Q3_num),
    Q3_min = min(Q3_num),
    Q3_max = max(Q3_num),
    Q3_total = n(),
    Q3_sd = sd(Q3_num)
  )
## # A tibble: 1 x 6
## Q3_mean Q3_median Q3_min Q3_max Q3_total Q3_sd
## <dbl> <dbl> <dbl> <dbl> <int> <dbl>
## 1 5.90 1.50 0 11.0 31 5.04
# Histogram of the square root of Q3_num
# (unused zoom: coord_cartesian(xlim=c(0,3100),ylim=c(0,15)))
data %>%
  transmute(Q3_num = sqrt(Q3_num)) %>%
  ggplot(aes(Q3_num)) +
  geom_histogram(binwidth = 1)
For those who only identify themselves as Females
# Log-scale histogram of login frequency for female respondents.
# log1p() (= log(1 + x)) is used instead of log(): Q3_num contains zeros, and
# log(0) is -Inf, which geom_histogram() silently drops.
females <- data %>%
  filter(Q7 == "Female") %>%
  mutate(Q3_num = log1p(Q3_num))
ggplot(females, aes(Q3_num)) +
  geom_histogram(binwidth = 1) +
  coord_cartesian(xlim = c(0, 10), ylim = c(0, 10))
For those who only identify themselves as Males
# Log-scale histogram of login frequency for male respondents.
# log1p() (= log(1 + x)) is used instead of log(): Q3_num contains zeros, and
# log(0) is -Inf, which geom_histogram() silently drops.
males <- data %>%
  filter(Q7 == "Male") %>%
  mutate(Q3_num = log1p(Q3_num))
ggplot(males, aes(Q3_num)) +
  geom_histogram(binwidth = 1) +
  coord_cartesian(xlim = c(0, 10), ylim = c(0, 10))
# Descriptive statistics for the recoded session length (Q5_num)
data %>%
  summarise(
    Q5_mean = mean(Q5_num),
    Q5_median = median(Q5_num),
    Q5_min = min(Q5_num),
    Q5_max = max(Q5_num),
    Q5_total = n(),
    Q5_sd = sd(Q5_num)
  )
## # A tibble: 1 x 6
## Q5_mean Q5_median Q5_min Q5_max Q5_total Q5_sd
## <dbl> <dbl> <dbl> <dbl> <int> <dbl>
## 1 14.9 2.50 0 60.0 31 22.9
# Histogram of the square root of Q5_num. This section summarises Q5, so plot
# Q5_num here (the original line re-plotted Q3_num, a copy-paste slip).
data %>%
  transmute(Q5_num = sqrt(Q5_num)) %>%
  ggplot(aes(Q5_num)) +
  geom_histogram(binwidth = 1)
For those who qualify themselves as females
# Log-scale histogram of session length for female respondents.
# log1p() (= log(1 + x)) is used instead of log(): Q5_num contains zeros, and
# log(0) is -Inf, which geom_histogram() silently drops.
females <- data %>%
  filter(Q7 == "Female") %>%
  mutate(Q5_num = log1p(Q5_num))
ggplot(females, aes(Q5_num)) +
  geom_histogram(binwidth = 1) +
  coord_cartesian(xlim = c(0, 10), ylim = c(0, 10))
For those who qualify themselves as males
# Log-scale histogram of session length for male respondents.
# log1p() (= log(1 + x)) is used instead of log(): Q5_num contains zeros, and
# log(0) is -Inf, which geom_histogram() silently drops.
males <- data %>%
  filter(Q7 == "Male") %>%
  mutate(Q5_num = log1p(Q5_num))
ggplot(males, aes(Q5_num)) +
  geom_histogram(binwidth = 1) +
  coord_cartesian(xlim = c(0, 10), ylim = c(0, 10))
# Descriptive statistics for total_time (Q3_num * Q5_num)
data %>%
  summarise(
    t_mean = mean(total_time),
    t_median = median(total_time),
    t_min = min(total_time),
    t_max = max(total_time),
    t_total = n(),
    t_sd = sd(total_time)
  )
## # A tibble: 1 x 6
## t_mean t_median t_min t_max t_total t_sd
## <dbl> <dbl> <dbl> <dbl> <int> <dbl>
## 1 112 18.8 0 660 31 219
# Histogram of the square root of total_time
# (unused zoom: coord_cartesian(xlim=c(0,100),ylim=c(0,20)))
data %>%
  transmute(total_time = sqrt(total_time)) %>%
  ggplot(aes(total_time)) +
  geom_histogram(binwidth = 1)
Now let’s run a t-test to see whether males or females use Facebook more.
# Split the respondents by their Q7 answer and extract total_time
male <- filter(data, Q7 == "Male")
female <- filter(data, Q7 == "Female")
male_tt <- male[["total_time"]]
female_tt <- female[["total_time"]]
# Two-sample t-test with pooled variance (var.equal = TRUE), male vs. female
t.test(male_tt, female_tt, var.equal = TRUE)
##
## Two Sample t-test
##
## data: male_tt and female_tt
## t = -0.04528, df = 29, p-value = 0.9642
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -169.1646 161.8364
## sample estimates:
## mean of x mean of y
## 109.6692 113.3333
# Keep only the gender answer and the three numeric measures for plotting
megaplot <- select(data, Q7, Q3_num, Q5_num, total_time)
print(megaplot)
## # A tibble: 31 x 4
## Q7 Q3_num Q5_num total_time
## <chr> <dbl> <dbl> <dbl>
## 1 Female 11.0 2.50 27.5
## 2 Female 11.0 60.0 660
## 3 Female 11.0 12.5 138
## 4 Male 1.00 0 0
## 5 Male 11.0 0 0
## 6 Male 0.140 2.50 0.350
## 7 Male 0 0 0
## 8 Male 11.0 60.0 660
## 9 Female 1.50 0 0
## 10 Female 1.50 60.0 90.0
## # ... with 21 more rows
library(GGally)
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
# Parallel-coordinate plot of the three numeric measures, coloured by Q7.
# Only the numeric columns (2:4) are used as axes: listing the text column Q7
# among the axes (columns = 1:4) raises "invalid factor level, NA generated"
# warnings.
ggparcoord(megaplot, columns = 2:4, groupColumn = "Q7", scale = "globalminmax")
## Warning in `[<-.factor`(`*tmp*`, ri, value = c(11, 11, 11, 1, 11, 0.14, :
## invalid factor level, NA generated
## Warning in `[<-.factor`(`*tmp*`, ri, value = c(11, 11, 11, 1, 11, 0.14, :
## invalid factor level, NA generated
## Warning in `[<-.factor`(`*tmp*`, ri, value = c(11, 11, 11, 1, 11, 0.14, :
## invalid factor level, NA generated
# Alternative kept for reference: group by Q7 and count
#   megaplot %>% group_by(Q7) %>% count()
# Preview: label each row with a string combining its values
mutate(
  megaplot,
  id = factor(paste(Q3_num, Q5_num, total_time, Q7, sep = "-"))
)
## # A tibble: 31 x 5
## Q7 Q3_num Q5_num total_time id
## <chr> <dbl> <dbl> <dbl> <fct>
## 1 Female 11.0 2.50 27.5 11-2.5-27.5-Female
## 2 Female 11.0 60.0 660 11-60-660-Female
## 3 Female 11.0 12.5 138 11-12.5-137.5-Female
## 4 Male 1.00 0 0 1-0-0-Male
## 5 Male 11.0 0 0 11-0-0-Male
## 6 Male 0.140 2.50 0.350 0.14-2.5-0.35-Male
## 7 Male 0 0 0 0-0-0-Male
## 8 Male 11.0 60.0 660 11-60-660-Male
## 9 Female 1.50 0 0 1.5-0-0-Female
## 10 Female 1.50 60.0 90.0 1.5-60-90-Female
## # ... with 21 more rows
# Store the labelled version: id combines Q3_num, Q5_num, total_time and Q7
megaplot2 <- mutate(
  megaplot,
  id = factor(paste(Q3_num, Q5_num, total_time, Q7, sep = "-"))
)
head(megaplot2, n = 31)
## # A tibble: 31 x 5
## Q7 Q3_num Q5_num total_time id
## <chr> <dbl> <dbl> <dbl> <fct>
## 1 Female 11.0 2.50 27.5 11-2.5-27.5-Female
## 2 Female 11.0 60.0 660 11-60-660-Female
## 3 Female 11.0 12.5 138 11-12.5-137.5-Female
## 4 Male 1.00 0 0 1-0-0-Male
## 5 Male 11.0 0 0 11-0-0-Male
## 6 Male 0.140 2.50 0.350 0.14-2.5-0.35-Male
## 7 Male 0 0 0 0-0-0-Male
## 8 Male 11.0 60.0 660 11-60-660-Male
## 9 Female 1.50 0 0 1.5-0-0-Female
## 10 Female 1.50 60.0 90.0 1.5-60-90-Female
## # ... with 21 more rows
library(GGally)
# Parallel-coordinate plot of the numeric columns, one colour per id label;
# rows with missing values are excluded.
ggparcoord(
  megaplot2,
  columns = 2:4,
  groupColumn = "id",
  scale = "globalminmax",
  missing = "exclude",
  title = "Parallel Coordinate Plot for Male-Female total times logged in, time spent during each logged session and total time "
)
# Convert the tibble to a plain data frame before handing it to ggparallel
mp3 <- as.data.frame(megaplot2)
library(ggparallel)
# Parallel-sets plot across Q7, total_time, Q3_num and Q5_num
ggparallel(
  vars = list("Q7", "total_time", "Q3_num", "Q5_num"),
  data = mp3,
  order = 0
)
This is the end of results—————–