-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathinternal_links_identification.R
28 lines (27 loc) · 1.11 KB
/
internal_links_identification.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# Identification of "internal" links
# NB: the use of prefixes is tied to the output of the web crawler Hyphe (http://hyphe.medialab.sciences-po.fr/). It can be adapted to other crawlers.
# Requires library: stringr
# Requires objects: data.frame htmls_edges (source, target, id) - data.frame prefixes (id_site, multi_prefixes, n_prefixes, name, prefixe 1 ... prefixe Nmax)
# Produces: data.frame htmls_edges_IN (source, target, weight)
# Keep only the edges whose target URL belongs to one of the known sites,
# relabel each such target with the id of the site it belongs to, drop
# self-loops, and count how many edges link each source to each target site.
#
# Arguments:
#   htmls_edges  data.frame with (at least) the columns `source` and `target`.
#                `target` is matched, case-insensitively, against the
#                prefixes; `source` is compared with the site id assigned to
#                the target to detect self-loops.
#   prefixes     data.frame with the columns `id_site` and `n_prefixes`; the
#                prefixes of the site in row i are read from the columns
#                5 .. (4 + n_prefixes[i]) of that row.
#
# Returns:
#   data.frame with the columns `source`, `target` (factors) and `weight`
#   (edge count), built from the cross-tabulation of the kept edges. As with
#   any `table()`, source/target combinations that never occur are present
#   with weight 0. A zero-row data.frame with the same three columns is
#   returned when no edge is kept.
#
# Notes:
#   * Prefixes are matched as literal substrings (`fixed = TRUE`), not as
#     regular expressions, so characters such as "." or "?" in a URL prefix
#     only match themselves.
#   * When a target matches prefixes of several sites, the site that comes
#     last in `prefixes` wins, and the edge is counted once.
sort_edges <- function(htmls_edges, prefixes) {
  # Lower-case the targets once instead of once per prefix.
  targets <- tolower(as.character(htmls_edges$target))
  # Site id assigned to each edge target; NA means "matched no prefix".
  target_site <- rep(NA_character_, length(targets))

  for (x in seq_len(nrow(prefixes))) {
    n_pref <- prefixes$n_prefixes[x]
    if (is.na(n_pref) || n_pref < 1) {
      next
    }
    site_id <- as.character(prefixes$id_site[x])
    # Prefix columns start at column 5.
    for (y in 4 + seq_len(n_pref)) {
      prefixe <- tolower(as.character(prefixes[[y]][x]))
      # An NA pattern would yield NA matches and an empty pattern would match
      # every target, so both are skipped.
      if (is.na(prefixe) || !nzchar(prefixe)) {
        next
      }
      hits <- grepl(prefixe, targets, fixed = TRUE)
      target_site[hits] <- site_id
    }
  }

  # Keep matched edges that are not self-loops; which() drops NA comparisons.
  keep <- which(
    !is.na(target_site) & as.character(htmls_edges$source) != target_site
  )
  if (length(keep) == 0) {
    return(data.frame(
      source = factor(character(0)),
      target = factor(character(0)),
      weight = integer(0)
    ))
  }

  htmls_edges_IN <- as.data.frame(
    table(htmls_edges$source[keep], target_site[keep])
  )
  names(htmls_edges_IN) <- c("source", "target", "weight")
  htmls_edges_IN
}