Introduction

The first step was to collect data through a survey on Facebook trolling. I got a total of 57 responses.

## Parsed with column specification:
## cols(
##   .default = col_character()
## )

## See spec(...) for full column specifications.

## # A tibble: 6 x 27
##   StartDate  EndDate  Status  IPAddress Progress `Duration (in s… Finished
##   <chr>      <chr>    <chr>   <chr>     <chr>    <chr>            <chr>   
## 1 Start Date End Date Respon… IP Addre… Progress Duration (in se… Finished
## 2 "{\"Impor… "{\"Imp… "{\"Im… "{\"Impo… "{\"Imp… "{\"ImportId\":… "{\"Imp…
## 3 2018-03-3… 2018-03… Survey… <NA>      100      6                True    
## 4 2018-03-3… 2018-03… IP Add… 185.69.1… 100      3                True    
## 5 2018-03-3… 2018-03… IP Add… 212.236.… 100      90               True    
## 6 2018-03-3… 2018-03… IP Add… 78.92.3.… 100      39               True    
## # ... with 20 more variables: RecordedDate <chr>, ResponseId <chr>,
## #   RecipientLastName <chr>, RecipientFirstName <chr>,
## #   RecipientEmail <chr>, ExternalReference <chr>, LocationLatitude <chr>,
## #   LocationLongitude <chr>, DistributionChannel <chr>,
## #   UserLanguage <chr>, Q1 <chr>, Q2 <chr>, Q2_3_TEXT <chr>, Q10 <chr>,
## #   Q3 <chr>, Q5 <chr>, Q6 <chr>, Q7 <chr>, Q7_3_TEXT <chr>, Q9 <chr>

## # A tibble: 6 x 28
##   id    StartDate   EndDate   Status  IPAddress Progress `Duration (in se…
##   <chr> <chr>       <chr>     <chr>   <chr>     <chr>    <chr>            
## 1 1     2018-03-30… 2018-03-… Survey… <NA>      100      6                
## 2 2     2018-03-30… 2018-03-… IP Add… 185.69.1… 100      3                
## 3 3     2018-03-30… 2018-03-… IP Add… 212.236.… 100      90               
## 4 4     2018-03-30… 2018-03-… IP Add… 78.92.3.… 100      39               
## 5 5     2018-03-30… 2018-03-… IP Add… 79.68.16… 100      98               
## 6 6     2018-03-30… 2018-03-… IP Add… 14.139.1… 100      55               
## # ... with 21 more variables: Finished <chr>, RecordedDate <chr>,
## #   ResponseId <chr>, RecipientLastName <chr>, RecipientFirstName <chr>,
## #   RecipientEmail <chr>, ExternalReference <chr>, LocationLatitude <chr>,
## #   LocationLongitude <chr>, DistributionChannel <chr>,
## #   UserLanguage <chr>, Q1 <chr>, Q2 <chr>, Q2_3_TEXT <chr>, Q10 <chr>,
## #   Q3 <chr>, Q5 <chr>, Q6 <chr>, Q7 <chr>, Q7_3_TEXT <chr>, Q9 <chr>

2. Data Columns

Let’s now see what the columns of our data set are. Note that, after the `id` column, the next 17 columns (columns 2–18, `StartDate` through `UserLanguage`) are metadata that Qualtrics inserted.

# List the column names of the survey data.
names(data)

##  [1] "id"                    "StartDate"            
##  [3] "EndDate"               "Status"               
##  [5] "IPAddress"             "Progress"             
##  [7] "Duration (in seconds)" "Finished"             
##  [9] "RecordedDate"          "ResponseId"           
## [11] "RecipientLastName"     "RecipientFirstName"   
## [13] "RecipientEmail"        "ExternalReference"    
## [15] "LocationLatitude"      "LocationLongitude"    
## [17] "DistributionChannel"   "UserLanguage"         
## [19] "Q1"                    "Q2"                   
## [21] "Q2_3_TEXT"             "Q10"                  
## [23] "Q3"                    "Q5"                   
## [25] "Q6"                    "Q7"                   
## [27] "Q7_3_TEXT"             "Q9"

Facebook Analysis

See how many people have experienced trolling

# Number of respondents per answer to Q1 (result stays grouped by Q1).
count(group_by(data, Q1))

## # A tibble: 2 x 2
## # Groups:   Q1 [2]
##   Q1        n
##   <chr> <int>
## 1 No       20
## 2 Yes      37

# Bar chart of the Q1 answers. The column is mapped by its bare name so that
# aes() looks it up in `data` itself (and the axis is labelled "Q1"), rather
# than capturing an external vector via `data$Q1`.
ggplot(data, aes(Q1)) +
  geom_bar()

# Create Q1_num: a numeric encoding of the Yes/No answer to Q1.
# as.numeric() cannot parse the strings "Yes"/"No" (it yields NA for every
# row), so the two answers are recoded explicitly. Any other value matches
# no branch of case_when() and becomes NA.
data <- data %>%
  mutate(Q1_num = case_when(Q1 == "Yes" ~ 1,
                            Q1 == "No" ~ 0))

# Show unsuccessful conversions: rows whose Q1 answer was neither "Yes" nor
# "No". Their ids are kept in `invalid_ids`.
invalid_ids <- data %>% filter(is.na(Q1_num)) %>% pull(id)
data %>% filter(is.na(Q1_num)) %>% select(id, Q1, Q1_num)

# Number of respondents per answer to Q2 (missing answers form their own row).
count(group_by(data, Q2))

## # A tibble: 3 x 2
## # Groups:   Q2 [3]
##   Q2                                                                     n
##   <chr>                                                              <int>
## 1 Personally directed remark                                            13
## 2 Remark on a certain community or group that you belong to (it cou…    20
## 3 <NA>                                                                  24

# Bar chart of the Q2 answers. The column is mapped by its bare name so that
# aes() looks it up in `data` itself (and the axis is labelled "Q2"), rather
# than capturing an external vector via `data$Q2`.
ggplot(data, aes(Q2)) +
  geom_bar()

# Number of respondents per answer to Q3 (missing answers form their own row).
count(group_by(data, Q3))

## # A tibble: 6 x 2
## # Groups:   Q3 [6]
##   Q3                       n
##   <chr>                <int>
## 1 1-3 times a day         13
## 2 10-12 times a day       15
## 3 I don't keep a track     2
## 4 Once a day               1
## 5 Once a week              2
## 6 <NA>                    24

# Bar chart of the Q3 answers. filter() keeps only rows where the comparison
# is TRUE, so rows with a missing Q3 are dropped along with empty strings.
ggplot(filter(data, Q3 != ""), aes(Q3)) +
  geom_bar()

# Show the rows that have no missing value in any column. The result is only
# printed; `data` itself is not modified.
drop_na(data)

## # A tibble: 0 x 29
## # ... with 29 variables: id <chr>, StartDate <chr>, EndDate <chr>,
## #   Status <chr>, IPAddress <chr>, Progress <chr>, `Duration (in
## #   seconds)` <chr>, Finished <chr>, RecordedDate <chr>, ResponseId <chr>,
## #   RecipientLastName <chr>, RecipientFirstName <chr>,
## #   RecipientEmail <chr>, ExternalReference <chr>, LocationLatitude <chr>,
## #   LocationLongitude <chr>, DistributionChannel <chr>,
## #   UserLanguage <chr>, Q1 <chr>, Q2 <chr>, Q2_3_TEXT <chr>, Q10 <chr>,
## #   Q3 <chr>, Q5 <chr>, Q6 <chr>, Q7 <chr>, Q7_3_TEXT <chr>, Q9 <chr>,
## #   Q1_num <dbl>

# Number of respondents per answer to Q10 (missing answers form their own row).
count(group_by(data, Q10))

## # A tibble: 4 x 2
## # Groups:   Q10 [4]
##   Q10       n
##   <chr> <int>
## 1 Maybe     4
## 2 No       18
## 3 Yes      11
## 4 <NA>     24

# Bar chart of the Q10 answers. The column is mapped by its bare name so that
# aes() looks it up in `data` itself (and the axis is labelled "Q10"), rather
# than capturing an external vector via `data$Q10`.
ggplot(data, aes(Q10)) +
  geom_bar()

# 2 x 3 contingency table of counts: gender in rows, category in columns.
# NOTE(review): the counts are typed in by hand rather than computed from
# `data` -- confirm where they come from.
heatmap_1 <- as.table(matrix(
  c(14, 10, 26,
    12, 8, 17),
  nrow = 2,
  byrow = TRUE,
  dimnames = list(c("Male", "Female"), c("PR", "CR", "Report"))
))
heatmap_1

##        PR CR Report
## Male   14 10     26
## Female 12  8     17

# Matrix form of the table, used as the input to heatmap().
heat_matrix <- data.matrix(heatmap_1)

Heatmap

Here I create a heatmap to look for patterns between male and female respondents: which of the two groups reported more than the other, and whether males or females faced personal remarks or community remarks more often than the other group.

# Draw the heatmap of the raw counts: no row/column dendrograms or
# reordering (Rowv/Colv = NA) and no rescaling of the values.
malefemale_heatmap <- heatmap(
  heat_matrix,
  scale = "none",
  Rowv = NA,
  Colv = NA,
  margins = c(5, 5),
  col = heat.colors(256)
)

Total time

I was interested in running a t-test to see which group, male or female, spends more time on Facebook. To do that, I created a new column that stores the average number of times per day people log in to their Fb account. I created another column that holds the time (in minutes) people spend each time they log in. I then multiplied the two to get the total time spent by each respondent, so that males and females can be compared. Then, I ran the t-test.

# Recode the categorical answers of Q3 (login frequency) and Q5 (time per
# session) into approximate numbers, stored as strings at this stage.
# A genuinely missing answer (NA, as opposed to the literal string "NA")
# matches no branch, so its recoded value is NA.
# NOTE(review): "1-3 times a day" in the Q5 recoding looks like a Q3 answer
# label -- confirm it is a real Q5 option.
data <- data %>%
  mutate(
    Q3_num = case_when(
      Q3 == "Once a week" ~ "0.14",
      Q3 == "Once a day" ~ "1",
      Q3 == "1-3 times a day" ~ "1.5",
      Q3 == "10-12 times a day" ~ "11",
      Q3 %in% c("I don't keep a track", "", "NA") ~ "0"
    ),
    Q5_num = case_when(
      Q5 == "1-5 minutes (Quickly checking notifications)" ~ "2.5",
      Q5 == "10-15 minutes" ~ "12.5",
      Q5 == "An hour" ~ "60",
      Q5 == "1-3 times a day" ~ "1.5",
      Q5 %in% c("It really depends on the day", "I have no idea", "", "NA") ~ "0"
    )
  )

Descriptive statistics for how many times Fb was accessed in a day

# Keep only respondents with both recoded answers, convert the recoded
# strings to numbers, and derive the total time as frequency x duration.
data <- data %>%
  drop_na(Q3_num, Q5_num) %>%
  mutate(
    Q3_num = as.numeric(Q3_num),
    Q5_num = as.numeric(Q5_num),
    total_time = Q3_num * Q5_num
  )

# One-row summary of Q3_num: centre, range, number of rows and spread.
summarise(
  data,
  Q3_mean = mean(Q3_num),
  Q3_median = median(Q3_num),
  Q3_min = min(Q3_num),
  Q3_max = max(Q3_num),
  Q3_total = n(),
  Q3_sd = sd(Q3_num)
)

## # A tibble: 1 x 6
##   Q3_mean Q3_median Q3_min Q3_max Q3_total Q3_sd
##     <dbl>     <dbl>  <dbl>  <dbl>    <int> <dbl>
## 1    5.90      1.50      0   11.0       31  5.04

# Histogram of the square root of Q3_num, one unit per bin.
# Disabled zoom: coord_cartesian(xlim=c(0,3100),ylim=c(0,15))
data %>%
  transmute(Q3_num = sqrt(Q3_num)) %>%
  ggplot(aes(Q3_num)) +
  geom_histogram(binwidth = 1)

For those who only identify themselves as Females

# Female respondents only: histogram of log(Q3_num), view limited to 0-10 on
# both axes.
# NOTE(review): Q3_num contains zeros and log(0) is -Inf -- confirm that
# losing those rows from the histogram is intended.
females <- mutate(filter(data, Q7 == "Female"), Q3_num = log(Q3_num))
females %>%
  ggplot(aes(Q3_num)) +
  geom_histogram(binwidth = 1) +
  coord_cartesian(xlim = c(0, 10), ylim = c(0, 10))

For those who only identify themselves as Males

# Male respondents only: histogram of log(Q3_num), view limited to 0-10 on
# both axes.
# NOTE(review): Q3_num contains zeros and log(0) is -Inf -- confirm that
# losing those rows from the histogram is intended.
males <- mutate(filter(data, Q7 == "Male"), Q3_num = log(Q3_num))
males %>%
  ggplot(aes(Q3_num)) +
  geom_histogram(binwidth = 1) +
  coord_cartesian(xlim = c(0, 10), ylim = c(0, 10))

Descriptive statistics for how much time was spent on Fb each time that a person logged in (time in minutes)

# One-row summary of Q5_num: centre, range, number of rows and spread.
summarise(
  data,
  Q5_mean = mean(Q5_num),
  Q5_median = median(Q5_num),
  Q5_min = min(Q5_num),
  Q5_max = max(Q5_num),
  Q5_total = n(),
  Q5_sd = sd(Q5_num)
)

## # A tibble: 1 x 6
##   Q5_mean Q5_median Q5_min Q5_max Q5_total Q5_sd
##     <dbl>     <dbl>  <dbl>  <dbl>    <int> <dbl>
## 1    14.9      2.50      0   60.0       31  22.9

# Histogram of the square root of Q5_num (time per session), one unit per
# bin. This section summarises Q5_num, so Q5_num is the variable plotted here
# rather than Q3_num again.
ggplot(data %>% transmute(Q5_num = sqrt(Q5_num)), aes(Q5_num)) +
  geom_histogram(binwidth = 1)

For those who qualify themselves as females

# Female respondents only: histogram of log(Q5_num), view limited to 0-10 on
# both axes.
# NOTE(review): Q5_num contains zeros and log(0) is -Inf -- confirm that
# losing those rows from the histogram is intended.
females <- mutate(filter(data, Q7 == "Female"), Q5_num = log(Q5_num))
females %>%
  ggplot(aes(Q5_num)) +
  geom_histogram(binwidth = 1) +
  coord_cartesian(xlim = c(0, 10), ylim = c(0, 10))

For those who qualify themselves as males

# Male respondents only: histogram of log(Q5_num), view limited to 0-10 on
# both axes.
# NOTE(review): Q5_num contains zeros and log(0) is -Inf -- confirm that
# losing those rows from the histogram is intended.
males <- mutate(filter(data, Q7 == "Male"), Q5_num = log(Q5_num))
males %>%
  ggplot(aes(Q5_num)) +
  geom_histogram(binwidth = 1) +
  coord_cartesian(xlim = c(0, 10), ylim = c(0, 10))

Descriptive statistics for how much time was spent on Fb

# One-row summary of total_time: centre, range, number of rows and spread.
summarise(
  data,
  t_mean = mean(total_time),
  t_median = median(total_time),
  t_min = min(total_time),
  t_max = max(total_time),
  t_total = n(),
  t_sd = sd(total_time)
)

## # A tibble: 1 x 6
##   t_mean t_median t_min t_max t_total  t_sd
##    <dbl>    <dbl> <dbl> <dbl>   <int> <dbl>
## 1    112     18.8     0   660      31   219

# Histogram of the square root of total_time, one unit per bin.
# Disabled zoom: coord_cartesian(xlim=c(0,100),ylim=c(0,20))
data %>%
  transmute(total_time = sqrt(total_time)) %>%
  ggplot(aes(total_time)) +
  geom_histogram(binwidth = 1)

Now I run a t-test to see whether males or females use Facebook more.

# Two-sample t-test (equal variances assumed) comparing the total time of
# male and female respondents.
male <- filter(data, Q7 == "Male")
female <- filter(data, Q7 == "Female")
male_tt <- male$total_time
female_tt <- female$total_time
t.test(male_tt, female_tt, var.equal = TRUE)

## 
##  Two Sample t-test
## 
## data:  male_tt and female_tt
## t = -0.04528, df = 29, p-value = 0.9642
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -169.1646  161.8364
## sample estimates:
## mean of x mean of y 
##  109.6692  113.3333

# Gender plus the three numeric measures, used for the plots below.
megaplot <- select(data, Q7, Q3_num, Q5_num, total_time)

print(megaplot)

## # A tibble: 31 x 4
##    Q7     Q3_num Q5_num total_time
##    <chr>   <dbl>  <dbl>      <dbl>
##  1 Female 11.0     2.50     27.5  
##  2 Female 11.0    60.0     660    
##  3 Female 11.0    12.5     138    
##  4 Male    1.00    0         0    
##  5 Male   11.0     0         0    
##  6 Male    0.140   2.50      0.350
##  7 Male    0       0         0    
##  8 Male   11.0    60.0     660    
##  9 Female  1.50    0         0    
## 10 Female  1.50   60.0      90.0  
## # ... with 21 more rows

library(GGally)

## 
## Attaching package: 'GGally'

## The following object is masked from 'package:dplyr':
## 
##     nasa

# Parallel-coordinate plot of the three numeric measures, with lines grouped
# by gender. Q7 is a character column and serves only as the grouping
# variable, so the axes are restricted to the numeric columns 2:4 instead of
# also placing Q7 on an axis.
ggparcoord(megaplot, columns = 2:4, groupColumn = "Q7", scale = "globalminmax")

# Disabled: group by combinations and count
#   megaplot %>% group_by(Q7) %>% count()

# Preview an id column that encodes each row's value combination as
# "<Q3_num>-<Q5_num>-<total_time>-<Q7>". The result is printed, not stored.
mutate(megaplot, id = factor(paste(Q3_num, Q5_num, total_time, Q7, sep = "-")))

## # A tibble: 31 x 5
##    Q7     Q3_num Q5_num total_time id                  
##    <chr>   <dbl>  <dbl>      <dbl> <fct>               
##  1 Female 11.0     2.50     27.5   11-2.5-27.5-Female  
##  2 Female 11.0    60.0     660     11-60-660-Female    
##  3 Female 11.0    12.5     138     11-12.5-137.5-Female
##  4 Male    1.00    0         0     1-0-0-Male          
##  5 Male   11.0     0         0     11-0-0-Male         
##  6 Male    0.140   2.50      0.350 0.14-2.5-0.35-Male  
##  7 Male    0       0         0     0-0-0-Male          
##  8 Male   11.0    60.0     660     11-60-660-Male      
##  9 Female  1.50    0         0     1.5-0-0-Female      
## 10 Female  1.50   60.0      90.0   1.5-60-90-Female    
## # ... with 21 more rows

# Store the table with the value-combination id column
# ("<Q3_num>-<Q5_num>-<total_time>-<Q7>" as a factor) and show up to 31 rows.
megaplot2 <- mutate(
  megaplot,
  id = factor(paste(Q3_num, Q5_num, total_time, Q7, sep = "-"))
)

head(megaplot2, n = 31)

## # A tibble: 31 x 5
##    Q7     Q3_num Q5_num total_time id                  
##    <chr>   <dbl>  <dbl>      <dbl> <fct>               
##  1 Female 11.0     2.50     27.5   11-2.5-27.5-Female  
##  2 Female 11.0    60.0     660     11-60-660-Female    
##  3 Female 11.0    12.5     138     11-12.5-137.5-Female
##  4 Male    1.00    0         0     1-0-0-Male          
##  5 Male   11.0     0         0     11-0-0-Male         
##  6 Male    0.140   2.50      0.350 0.14-2.5-0.35-Male  
##  7 Male    0       0         0     0-0-0-Male          
##  8 Male   11.0    60.0     660     11-60-660-Male      
##  9 Female  1.50    0         0     1.5-0-0-Female      
## 10 Female  1.50   60.0      90.0   1.5-60-90-Female    
## # ... with 21 more rows

library(GGally)

# Parallel-coordinate plot of the three numeric measures (columns 2:4), with
# lines grouped by the value-combination id.
ggparcoord(
  megaplot2,
  columns = 2:4,
  groupColumn = "id",
  scale = "globalminmax",
  missing = "exclude",
  title = "Parallel Coordinate Plot for Male-Female total times logged in, time spent during each logged session and total time "
)

# ggparallel plot over gender and the three measures, drawn from a plain
# data.frame copy of megaplot2.
library(ggparallel)
mp3 <- as.data.frame(megaplot2)
ggparallel(
  vars = list("Q7", "total_time", "Q3_num", "Q5_num"),
  data = mp3,
  order = 0
)

This is the end of the results.

Final data project: Facebook trolling