Hydrosocial Preference

Loading and combining the files

library(pdftools)
library(tidytext)
EntrevistaAmauri <- pdf_text("AmauriPolachi_CBHAT.pdf")
EntrevistaCica <- pdf_text("Ciça_AliançaPelaÁgua.pdf")
EntrevistaEdson <- pdf_text("EdsonAparecido_Coletivo de luta pela água.pdf")
EntrevistaMarciaN <- pdf_text("MárciaNascimento_SMASP.pdf")
EntrevistaKachel <- pdf_text("Kachel_UMC_ExSABESP.pdf")
EntrevistaMarzeni <- pdf_text("Marzeni_ColetivoAguaSim_ExSABESP.pdf")
EntrevistaStela <- pdf_text("StelaGoldenstein_AguasClaras.pdf")
EntrevistaCibim <- pdf_text("JulianaCibim_AliançaPelaÁgua.pdf")
EntrevistaLuizDeus <- pdf_text("LuizdeDeus_AssociaçãoSenhorBonfim.pdf")
Entrevista_Maru <- pdf_text("Maru_AliançapelaAgua.pdf")
EntrevistaMonicaRos <- pdf_text("MonicaRossi_CDHU.pdf")
Entrevista_Tagnin <- pdf_text("Tagnin_SENAC.pdf")
Entrevista_Virgilio <- pdf_text("Virgilio _MDV.pdf")
Entrevista_Mazolenis <- pdf_text("EduardoMazolenis_Cetesb.pdf")
Entrevista_RicardoCastro <- pdf_text("Ricardo Castro_MP.pdf")
EntrevPrefHidro <- c(EntrevistaAmauri, EntrevistaCica, EntrevistaEdson, EntrevistaMarciaN,
                     EntrevistaKachel, EntrevistaMarzeni, EntrevistaStela, EntrevistaCibim,
                     EntrevistaLuizDeus, Entrevista_Maru, EntrevistaMonicaRos, Entrevista_Tagnin,
                     Entrevista_Virgilio, Entrevista_Mazolenis, Entrevista_RicardoCastro)
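The fifteen objects above can also be built in one pass. The sketch below is an alternative, assuming every interview PDF sits in the working directory and nothing else matches the pattern; the name arquivos is introduced here only for illustration.

arquivos <- list.files(pattern = "\\.pdf$")             # every PDF in the folder (assumption)
EntrevPrefHidro <- unlist(lapply(arquivos, pdf_text))   # one element per page, all interviews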
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(tm)
## Loading required package: NLP
EntrevPrefHidro <- EntrevPrefHidro %>%
  paste(collapse = " ") %>%            # collapse all pages into a single string
  str_replace_all("[\r\n]+", " ") %>%  # turn line breaks into spaces instead of gluing words
  str_to_lower() %>%
  removeNumbers() %>%
  removePunctuation() %>%
  str_squish()                         # trim and collapse repeated whitespace
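As a quick sanity check, the same cleaning steps applied to a made-up sample line (the string below is illustrative, not taken from the interviews) behave as expected:

exemplo <- "Entrevistado: a crise de 2014,\nna Billings!"
exemplo %>%
  str_replace_all("[\r\n]+", " ") %>%
  str_to_lower() %>%
  removeNumbers() %>%
  removePunctuation() %>%
  str_squish()
## [1] "entrevistado a crise de na billings"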

Tokenization

tokensHidro <- str_split(EntrevPrefHidro, " ")

tokensHidro <- unlist(tokensHidro)
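An equivalent route, sketched here only for comparison, is to let tidytext tokenize the cleaned text directly; the rest of the analysis keeps the manual tokens, so this block is not used downstream.

tibble(id_discurso = 1, text = EntrevPrefHidro) %>%
  unnest_tokens(word, text)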

Creating the data frame

entrevista_df <- tibble(id_discurso = seq_along(tokensHidro),
                        text = tokensHidro)

entrevista_token <- entrevista_df %>%
  unnest_tokens(word, text)

Stopwords

# Portuguese stopwords from tm plus interview-specific terms to drop
stopwords_pt <- c(stopwords("pt"), "que", "é", "entrevistado",
                  "entrevistador", "pra", "porque", "r", "nentrevistador", 
                  "nentrevistado", "n", "questão", "vai", "ai",
                  "aqui", "sobre", "assim", "etc","pois", "desse", "né", "aí", "paulo",
                  "ainda", "então", "gente", "ser", "joão", "ricardo", "de", "lá", 
                  "acho", "ter", "sim", "coisa", "fazer", "estar", "fazendo", "d")

stopwords_pt_df <- data.frame(word = stopwords_pt, stringsAsFactors = FALSE)

entrevista_token <- entrevista_token %>%
  anti_join(stopwords_pt_df, by = "word")
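tidytext also ships get_stopwords(), which returns a ready-made tibble of Portuguese stopwords usable with anti_join(); the sketch below is an optional alternative and assumes the stopwords package is installed. The name stopwords_snowball_pt is introduced here only for illustration.

stopwords_snowball_pt <- get_stopwords(language = "pt")  # tibble with columns word, lexicon
entrevista_token %>%
  anti_join(stopwords_snowball_pt, by = "word")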

Word frequency plot

entrevista_token %>%
  count(word, sort = TRUE)
## # A tibble: 7,210 x 2
##    word         n
##    <chr>    <int>
##  1 água      1390
##  2 gestão     552
##  3 billings   468
##  4 recursos   456
##  5 sabesp     402
##  6 lei        390
##  7 estado     352
##  8 hídricos   348
##  9 crise      336
## 10 pessoas    306
## # ... with 7,200 more rows
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
entrevista_token %>%
  count(word, sort = TRUE) %>%
  filter(n > 270) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col(colour = "black", fill = "green") +
  xlab(NULL) +
  coord_flip()
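To keep the bar chart as a file, ggsave() writes the most recent plot to disk; the file name and dimensions below are arbitrary.

ggsave("frequencia_palavras.png", width = 7, height = 5)  # hypothetical file name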

Word cloud

library(wordcloud)
## Loading required package: RColorBrewer
entrevista_token %>%
  count(word, sort = TRUE) %>%
  with(wordcloud(word, n, use.r.layout = TRUE, max.words = 50))
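A variant of the cloud, sketched here, colours the words with an RColorBrewer palette (the package is already attached above as a dependency of wordcloud); the palette choice is arbitrary.

entrevista_token %>%
  count(word, sort = TRUE) %>%
  with(wordcloud(word, n, max.words = 50, colors = brewer.pal(8, "Dark2")))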

Bigram network

# entrevista_df holds one word per row, so bigrams are built from the cleaned
# running text rather than from the single-word tokens
entrevista_bigrams <- tibble(id_discurso = 1, text = EntrevPrefHidro) %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram))

entrevista_bigrams %>%
  count(bigram, sort = TRUE)
## # A tibble: 5,709 x 2
##    bigram                    n
##    <chr>                 <int>
##  1 a gente                  50
##  2 de recursos              36
##  3 recursos hídricos        34
##  4 da billings              32
##  5 região metropolitana     30
##  6 billings entrevistado    28
##  7 de saneamento            26
##  8 o que                    26
##  9 água entrevistado        24
## 10 da sabesp                24
## # ... with 5,699 more rows

Stopwords

library(tidyr)

bigrams_separated <- entrevista_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

# Keep only bigrams in which neither word is a stopword
bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stopwords_pt) %>%
  filter(!word2 %in% stopwords_pt)

bigram_counts <- bigrams_filtered %>% 
  count(word1, word2, sort = TRUE)

bigrams_united <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")

Network

library(igraph)
## 
## Attaching package: 'igraph'
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(ggraph)

bigram_graph <- bigram_counts %>%
  filter(n > 2) %>%
  graph_from_data_frame()

set.seed(2016)

a <- grid::arrow(type = "open", length = grid::unit(.10, "inches"))


ggraph(bigram_graph, layout = "fr") +
  geom_edge_link(show.legend = FALSE, arrow = a, end_cap = circle(.05, "inches")) +
  geom_node_point(color = "lightgreen", size = 5) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void()
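The density of the network is governed by the n > 2 cut-off used when building bigram_graph; a sparser version is a one-line change, sketched below with an arbitrary threshold.

bigram_counts %>%
  filter(n > 4) %>%                          # arbitrary higher cut-off
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(show.legend = FALSE) +
  geom_node_point(color = "lightgreen", size = 5) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void()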