Preferência Hidrossocial
Carregando arquivos e agrupando
library(pdftools)
library(tidytext)
# Interview transcripts (PDFs) that make up the hydrosocial-preference
# corpus. pdf_text() returns one character string per page, so the
# corpus vector below holds every page of every interview, in the same
# order the files are listed.
arquivos_entrevistas <- c(
  "AmauriPolachi_CBHAT.pdf",
  "Ciça_AliançaPelaÁgua.pdf",
  "EdsonAparecido_Coletivo de luta pela água.pdf",
  "MárciaNascimento_SMASP.pdf",
  "Kachel_UMC_ExSABESP.pdf",
  "Marzeni_ColetivoAguaSim_ExSABESP.pdf",
  "StelaGoldenstein_AguasClaras.pdf",
  "JulianaCibim_AliançaPelaÁgua.pdf",
  "LuizdeDeus_AssociaçãoSenhorBonfim.pdf",
  "Maru_AliançapelaAgua.pdf",
  "MonicaRossi_CDHU.pdf",
  "Tagnin_SENAC.pdf",
  "Virgilio _MDV.pdf",
  "EduardoMazolenis_Cetesb.pdf",
  "Ricardo Castro_MP.pdf"
)
# Read every PDF and flatten the per-file page vectors into one corpus,
# equivalent to c() over fifteen individual pdf_text() calls.
EntrevPrefHidro <- unlist(lapply(arquivos_entrevistas, pdf_text))
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(tm)
## Loading required package: NLP
# Collapse all pages into a single string and normalise it: strip line
# breaks, lowercase, then drop digits and punctuation (tm helpers).
#
# BUG FIX: the previous version called str_remove_all(c("\r", "\n")).
# stringr vectorises over the pattern argument, so a length-1 string
# with a length-2 pattern yields a length-2 result — one copy with only
# "\r" removed and one with only "\n" removed — silently duplicating
# the corpus downstream. A single character-class regex removes both
# in one pass. The second removeNumbers() call was redundant:
# removePunctuation() cannot introduce digits.
EntrevPrefHidro <- EntrevPrefHidro %>%
  paste(collapse = "") %>%
  str_remove_all("[\r\n]") %>%
  str_to_lower() %>%
  removeNumbers() %>%
  removePunctuation()
Tokenização
# Split the cleaned corpus on spaces and flatten the resulting list
# into a plain character vector of word tokens.
tokensHidro <- unlist(str_split(EntrevPrefHidro, " "))
Criando o Data Frame
# One row per raw token, with a running id. tibble() replaces the
# deprecated data_frame() alias, and seq_along() is safe if the token
# vector were ever empty (1:length(x) would produce c(1, 0)).
entrevista_df <- tibble(id_discurso = seq_along(tokensHidro),
                        text = tokensHidro)
# unnest_tokens() re-tokenises `text` into one lowercase word per row
# in the new `word` column.
entrevista_token <- entrevista_df %>%
  unnest_tokens(word, text)
Stopwords
# Portuguese stopwords from tm, extended with corpus-specific noise:
# interview-role markers, fillers, personal names, and OCR/line-break
# artefacts (e.g. "nentrevistador" = "\n" fused with "entrevistador").
stopwords_pt <- c(stopwords("pt"), "que", "é", "entrevistado",
                  "entrevistador", "pra", "porque", "r", "nentrevistador",
                  "nentrevistado", "n", "questão", "vai", "ai",
                  "aqui", "sobre", "assim", "etc","pois", "desse", "né", "aí", "paulo",
                  "ainda", "então", "gente", "ser", "joão", "ricardo", "de", "lá",
                  "acho", "ter", "sim", "coisa", "fazer", "estar", "fazendo", "d")
# stringsAsFactors = FALSE keeps `word` as character, silencing the
# factor-coercion warning anti_join() otherwise raises.
stopwords_pt_df <- data.frame(word = stopwords_pt, stringsAsFactors = FALSE)
# Drop every token that appears in the stopword list.
entrevista_token <- entrevista_token %>%
  anti_join(stopwords_pt_df, by = "word")
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
Gráfico de frequência de palavras
# Word-frequency table, most frequent first.
count(entrevista_token, word, sort = TRUE)
## # A tibble: 7,210 x 2
## word n
## <chr> <int>
## 1 água 1390
## 2 gestão 552
## 3 billings 468
## 4 recursos 456
## 5 sabesp 402
## 6 lei 390
## 7 estado 352
## 8 hídricos 348
## 9 crise 336
## 10 pessoas 306
## # ... with 7,200 more rows
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
# Horizontal bar chart of the most frequent words (n > 270), with the
# bars ordered by frequency via reorder() and coord_flip().
palavras_frequentes <- entrevista_token %>%
  count(word, sort = TRUE) %>%
  filter(n > 270) %>%
  mutate(word = reorder(word, n))
ggplot(palavras_frequentes, aes(word, n)) +
  geom_col(colour = "black", fill = "green") +
  xlab(NULL) +
  coord_flip()

Nuvem de palavras
library(wordcloud)
## Loading required package: RColorBrewer
# Word cloud of the 50 most frequent tokens.
contagem_palavras <- count(entrevista_token, word, sort = TRUE)
wordcloud(contagem_palavras$word, contagem_palavras$n,
          use.r.layout = TRUE, max.words = 50)

Rede de Bigrams
# Bigram tokenisation: consecutive word pairs, dropping the NA rows
# that unnest_tokens() emits for rows too short to form a pair.
entrevista_bigrams <- entrevista_df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram))
# Most frequent bigrams (stopwords still present at this stage).
entrevista_bigrams %>%
  count(bigram, sort = TRUE)
## # A tibble: 5,709 x 2
## bigram n
## <chr> <int>
## 1 a gente 50
## 2 de recursos 36
## 3 recursos hídricos 34
## 4 da billings 32
## 5 região metropolitana 30
## 6 billings entrevistado 28
## 7 de saneamento 26
## 8 o que 26
## 9 água entrevistado 24
## 10 da sabesp 24
## # ... with 5,699 more rows
Stopwords
library(tidyr)
# Split each bigram into its two component words so stopwords can be
# filtered on either position.
bigrams_separated <- entrevista_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")
# Keep only pairs where neither word is a stopword.
#
# The original computed bigrams_filtered twice — once with %in% and
# once, immediately overwriting it, with equivalent anti_join() calls
# (which also raised factor-coercion warnings). The dead duplicate is
# removed; the warning-free %in% form is kept.
bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stopwords_pt) %>%
  filter(!word2 %in% stopwords_pt)
## Warning: Column `word1`/`word` joining character vector and factor,
## coercing into character vector
## Warning: Column `word2`/`word` joining character vector and factor,
## coercing into character vector
# Frequency of each (word1, word2) pair after stopword removal.
bigram_counts <- count(bigrams_filtered, word1, word2, sort = TRUE)
# Re-join the filtered pairs into single "word1 word2" strings.
bigrams_united <- unite(bigrams_filtered, bigram, word1, word2, sep = " ")
Rede
library(igraph)
##
## Attaching package: 'igraph'
## The following object is masked from 'package:tidyr':
##
## crossing
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(ggraph)
# Directed bigram network: keep only pairs seen more than twice and
# convert the count table into an igraph object (edges word1 -> word2).
bigram_graph <- bigram_counts %>%
  filter(n > 2) %>%
  graph_from_data_frame()
# Fixed seed so the force-directed ("fr") layout is reproducible.
set.seed(2016)
# Open arrowheads on the edges show reading order within each bigram.
seta <- grid::arrow(type = "open", length = unit(.10, "inches"))
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link(show.legend = FALSE, arrow = seta,
                 end_cap = circle(.05, 'inches')) +
  geom_node_point(color = "lightgreen", size = 5) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void()
