Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds the link_filter.yml file to the default configuration when running ACHE as a REST server #175

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
60 changes: 59 additions & 1 deletion src/main/java/focusedCrawler/crawler/CrawlersManager.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;

import focusedCrawler.link.LinkFilterConfig;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand All @@ -33,9 +36,11 @@

public class CrawlersManager {

private static final String LINK_FILTERS_FILE = "/link_filters.yml";
private static Logger logger = LoggerFactory.getLogger(CrawlersManager.class);

private Configuration baseConfig;
private LinkFilterConfig linkFilterConfig;
private String baseDataPath;

private Map<String, CrawlContext> crawlers = new HashMap<>();
Expand All @@ -49,6 +54,12 @@ public CrawlersManager(String baseDataPath, Configuration baseConfig) {
this.baseDataPath = baseDataPath;
}

/**
 * Creates a manager that, in addition to the base crawler configuration, carries
 * link filter settings whose file location is consulted when new crawlers are created.
 *
 * @param baseDataPath     root directory under which each crawler's data is stored
 * @param baseConfig       base configuration used as a template for every crawler
 * @param linkFilterConfig link filter settings (source location of link_filters.yml)
 */
public CrawlersManager(String baseDataPath, Configuration baseConfig, LinkFilterConfig linkFilterConfig) {
    this.linkFilterConfig = linkFilterConfig;
    this.baseDataPath = baseDataPath;
    this.baseConfig = baseConfig;
}

public CrawlContext startCrawl(String crawlerId) {
if (!crawlers.containsKey(crawlerId)) {
throw new IllegalArgumentException("No crawler with crawler_id: " + crawlerId);
Expand All @@ -67,7 +78,8 @@ public Map<String, CrawlContext> getCrawls() {
}

public CrawlContext createCrawler(String crawlerId, StartCrawlParams params) throws Exception {
return createCrawler(crawlerId, params.crawlType, params.seeds, params.model,
return createCrawler(crawlerId, params.crawlType, params.seeds,
params.whitelist, params.blacklist, params.model,
params.esIndexName, params.esTypeName);
}

Expand All @@ -82,6 +94,7 @@ public CrawlContext createCrawler(String crawlerId, CrawlType crawlType, List<St

Path configPath = Paths.get(baseDataPath, crawlerId, "config");
createConfigForCrawlType(baseConfig, configPath, crawlType, esIndexName, esTypeName);
addLinkFiltersToCrawlConfig(linkFilterConfig.getFileLocation(), configPath);

String modelPath = storeModelFile(model, configPath.resolve("model"));
String seedPath = getSeedForCrawlType(crawlType, seeds, configPath, modelPath);
Expand All @@ -90,6 +103,33 @@ public CrawlContext createCrawler(String crawlerId, CrawlType crawlType, List<St
esTypeName);
}

/**
 * Creates a new crawler with explicit link filter white/black lists.
 *
 * @param crawlerId   unique id of the crawler; also names its data sub-directory
 * @param crawlType   type of crawl, used to generate the crawler's configuration
 * @param seeds       seed URLs for the crawl
 * @param whitelist   link filter whitelist patterns
 * @param blacklist   link filter blacklist patterns
 * @param model       serialized model file content (stored under the config dir)
 * @param esIndexName Elasticsearch index name
 * @param esTypeName  Elasticsearch type name
 * @return the context of the newly registered crawler
 * @throws Exception if configuration, model, or seed setup fails
 */
public CrawlContext createCrawler(String crawlerId, CrawlType crawlType, List<String> seeds,
        List<String> whitelist, List<String> blacklist,
        byte[] model, String esIndexName, String esTypeName)
        throws Exception {

    Path configPath = Paths.get(baseDataPath, crawlerId, "config");
    createConfigForCrawlType(baseConfig, configPath, crawlType, esIndexName, esTypeName);
    // The manager may have been built with the two-argument constructor, which
    // leaves linkFilterConfig null; dereferencing it unconditionally would NPE.
    if (linkFilterConfig != null) {
        addLinkFiltersToCrawlConfig(linkFilterConfig.getFileLocation(), configPath);
    } else {
        logger.info("No link filter configuration available for crawler: {}", crawlerId);
    }

    String modelPath = storeModelFile(model, configPath.resolve("model"));
    String seedPath = getSeedForCrawlType(crawlType, seeds, configPath, modelPath);

    return createCrawler(crawlerId, configPath.toString(), seedPath, modelPath,
            whitelist, blacklist, esIndexName, esTypeName);
}

/**
 * Copies the link_filters.yml file from {@code configurationPath} into the new
 * crawler's own configuration directory so the crawler picks it up on startup.
 *
 * @param configurationPath directory expected to contain a link_filters.yml file;
 *                          when empty, nothing is copied
 * @param configPath        destination configuration directory of the crawler
 * @throws IOException if the file copy fails
 */
private void addLinkFiltersToCrawlConfig(String configurationPath, Path configPath) throws IOException {
    if (StringUtils.isEmpty(configurationPath)) {
        logger.info("No Link Filters to load.");
        return;
    }
    File sourceFile = new File(configurationPath + LINK_FILTERS_FILE);
    File destinationFile = new File(configPath + LINK_FILTERS_FILE);
    FileUtils.copyFile(sourceFile, destinationFile);
}

public CrawlContext createCrawler(String crawlerId, String configPath, String seedPath,
String modelPath,
String esIndexName, String esTypeName) throws Exception {
Expand All @@ -108,6 +148,24 @@ public CrawlContext createCrawler(String crawlerId, String configPath, String se
return context;
}

/**
 * Registers and returns a crawl context backed by an {@link AsyncCrawler} that is
 * configured with explicit whitelist/blacklist link filters.
 *
 * @param crawlerId   unique id of the crawler
 * @param configPath  path to the crawler's configuration directory
 * @param seedPath    path to the seed file
 * @param modelPath   path to the classifier model directory (may be null)
 * @param whitelist   link filter whitelist patterns
 * @param blacklist   link filter blacklist patterns
 * @param esIndexName Elasticsearch index name
 * @param esTypeName  Elasticsearch type name
 * @return the registered crawl context
 * @throws Exception if the underlying crawler cannot be created
 */
public CrawlContext createCrawler(String crawlerId, String configPath, String seedPath,
        String modelPath, List<String> whitelist, List<String> blacklist,
        String esIndexName, String esTypeName) throws Exception {

    CrawlContext context = new CrawlContext();
    context.crawlerId = crawlerId;
    context.dataPath = Paths.get(baseDataPath, crawlerId).toString();
    context.seedPath = seedPath;
    context.modelPath = modelPath;
    context.crawler = AsyncCrawler.create(crawlerId, configPath, context.dataPath, seedPath,
            modelPath, whitelist, blacklist, esIndexName, esTypeName);

    crawlers.put(crawlerId, context);
    return context;
}

private Configuration createConfigForCrawlType(Configuration baseConfig, Path configPath,
CrawlType crawlType, String esIndexName, String esTypeName) throws IOException {

Expand Down
43 changes: 41 additions & 2 deletions src/main/java/focusedCrawler/crawler/async/AsyncCrawler.java
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,9 @@ public void shutDown() {
logger.info("Shutdown finished.");
}

public static AsyncCrawler create(String crawlerId, String configPath, String dataPath, String seedPath,
String modelPath, String esIndexName, String esTypeName) throws Exception {
public static AsyncCrawler create(String crawlerId, String configPath, String dataPath,
String seedPath, String modelPath, String esIndexName, String esTypeName)
throws Exception {

Configuration config = new Configuration(configPath);

Expand All @@ -121,6 +122,44 @@ public static AsyncCrawler create(String crawlerId, String configPath, String da
return new AsyncCrawler(crawlerId, targetStorage, linkStorage, config, dataPath, metricsManager);
}

/**
 * Factory for an AsyncCrawler whose link storage applies the given
 * whitelist/blacklist link filters.
 *
 * @param crawlerId   unique id of the crawler
 * @param configPath  path to the configuration directory
 * @param dataPath    path where crawl data is stored
 * @param seedPath    path to the seed file
 * @param modelPath   path to the classifier model
 * @param whitelist   link filter whitelist patterns
 * @param blacklist   link filter blacklist patterns
 * @param esIndexName Elasticsearch index name
 * @param esTypeName  Elasticsearch type name
 * @throws Exception if any storage component fails to initialize
 */
public static AsyncCrawler create(String crawlerId, String configPath, String dataPath,
        String seedPath, String modelPath,
        List<String> whitelist, List<String> blacklist,
        String esIndexName, String esTypeName)
        throws Exception {

    Configuration crawlConfig = new Configuration(configPath);

    MetricsManager metrics = new MetricsManager(false, dataPath);

    // Link storage is created first; target storage receives it as a dependency.
    LinkStorage links = LinkStorage.create(configPath, seedPath, dataPath, modelPath,
            whitelist, blacklist, crawlConfig.getLinkStorageConfig(), metrics);

    TargetStorage targets = TargetStorage.create(configPath, modelPath, dataPath,
            esIndexName, esTypeName, crawlConfig.getTargetStorageConfig(), links, metrics);

    return new AsyncCrawler(crawlerId, targets, links, crawlConfig, dataPath, metrics);
}

/**
 * Factory for an AsyncCrawler where the link storage is configured from a
 * separate (override) configuration directory.
 *
 * NOTE(review): only LinkStorage.create receives {@code overrideConfigPath};
 * the main {@code Configuration} and {@code TargetStorage} are still built
 * from {@code configPath} — confirm this asymmetry is intentional.
 *
 * @param crawlerId          unique id of the crawler
 * @param configPath         path used for the main configuration and target storage
 * @param overrideConfigPath path used only to configure the link storage
 * @param dataPath           path where crawl data is stored
 * @param seedPath           path to the seed file
 * @param modelPath          path to the classifier model
 * @param esIndexName        Elasticsearch index name
 * @param esTypeName         Elasticsearch type name
 * @throws Exception if any storage component fails to initialize
 */
public static AsyncCrawler create(String crawlerId, String configPath, String overrideConfigPath, String dataPath,
String seedPath, String modelPath, String esIndexName, String esTypeName)
throws Exception {

// Main configuration always comes from configPath, not the override path.
Configuration config = new Configuration(configPath);

MetricsManager metricsManager = new MetricsManager(false, dataPath);

// Link storage alone is built from the override configuration directory.
LinkStorage linkStorage = LinkStorage.create(overrideConfigPath, seedPath, dataPath,
modelPath, config.getLinkStorageConfig(), metricsManager);

TargetStorage targetStorage = TargetStorage.create(configPath, modelPath, dataPath,
esIndexName, esTypeName, config.getTargetStorageConfig(), linkStorage,
metricsManager);

return new AsyncCrawler(crawlerId, targetStorage, linkStorage, config, dataPath, metricsManager);
}

public MetricsManager getMetricsManager() {
return metricsManager;
}
Expand Down
97 changes: 97 additions & 0 deletions src/main/java/focusedCrawler/link/LinkFilterConfig.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
package focusedCrawler.link;

import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.dataformat.yaml.YAMLFactory;
import org.apache.commons.lang3.StringUtils;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.List;

/**
 * Holds link filter settings (type, whitelist, blacklist) loaded from a
 * link_filters.yml file, plus the location of the directory that file was
 * read from so it can later be copied into a new crawler's config directory.
 */
public class LinkFilterConfig {

    private static final ObjectMapper yamlMapper = new ObjectMapper(new YAMLFactory());
    private static final String LINK_FILTERS_FILE = "link_filters.yml";

    static {
        // Ignore YAML keys this class does not model instead of failing.
        yamlMapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
    }

    // NOTE(review): dotted property names ("global.type") map to literal flat
    // YAML keys, not nested "global:" sections — confirm the link_filters.yml
    // format these are expected to match.
    @JsonProperty("global.type")
    private String type = "wildcard";

    @JsonProperty("global.whitelist")
    private List<String> whitelist = Collections.emptyList();

    @JsonProperty("global.blacklist")
    private List<String> blacklist = Collections.emptyList();

    // Directory containing the link_filters.yml file; empty when unknown.
    private String fileLocation;

    public LinkFilterConfig() {
    }

    public LinkFilterConfig(String configPath) {
        this(Paths.get(configPath));
    }

    /**
     * Loads link filters from the given path. The path may be either a
     * directory containing a link_filters.yml file, or the file itself.
     *
     * @throws IllegalArgumentException if the file cannot be read or parsed
     */
    public LinkFilterConfig(Path linkFiltersPath) {
        Path linkFiltersFile;
        if (Files.isDirectory(linkFiltersPath)) {
            linkFiltersFile = linkFiltersPath.resolve(LINK_FILTERS_FILE);
        } else {
            linkFiltersFile = linkFiltersPath;
        }
        try {
            init(yamlMapper.readTree(linkFiltersFile.toFile()));
        } catch (IOException e) {
            throw new IllegalArgumentException("Could not read config from file: " + linkFiltersFile.toString(), e);
        }
        // Record where the filters file lives so callers no longer need a
        // separate setFileLocation() call after construction.
        Path parentDir = linkFiltersFile.getParent();
        this.fileLocation = (parentDir != null) ? parentDir.toString() : "";
    }

    // Merges the parsed YAML tree into this instance's annotated fields.
    private void init(JsonNode linkFilters) throws IOException {
        yamlMapper.readerForUpdating(this).readValue(linkFilters);
    }

    public String getType() {
        return type;
    }

    public List<String> getWhitelist() {
        return whitelist;
    }

    public List<String> getBlacklist() {
        return blacklist;
    }

    public void setLinkFilterType(String type) {
        this.type = type;
    }

    public void setWhiteList(List<String> whitelist) {
        this.whitelist = whitelist;
    }

    public void setBlackList(List<String> blacklist) {
        this.blacklist = blacklist;
    }

    /** Returns the trimmed filters directory, or an empty string when unset. */
    public String getFileLocation() {
        if (StringUtils.isNotEmpty(fileLocation)) {
            return fileLocation.trim();
        } else {
            return "";
        }
    }

    /** Kept for backward compatibility; the Path constructor now sets this too. */
    public void setFileLocation(String fileLocation) {
        this.fileLocation = fileLocation;
    }
}
27 changes: 27 additions & 0 deletions src/main/java/focusedCrawler/link/LinkStorage.java
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,33 @@ public static LinkStorage create(String configPath, String seedFile, String data
FrontierManager frontierManager = FrontierManagerFactory.create(config, configPath,
dataPath, modelPath, seedFile, metricsManager);

OnlineLearning onlineLearning = null;
if (config.isUseOnlineLearning()) {
onlineLearning = createOnlineLearning(dataPath, config, stoplist, frontierManager);

}

return new LinkStorage(config, frontierManager, onlineLearning);
}

public static LinkStorage create(String configPath, String seedFile, String dataPath,
String modelPath, List<String> whitelist, List<String> blacklist,
LinkStorageConfig config, MetricsManager metricsManager)
throws FrontierPersistentException, IOException {

Path stoplistPath = Paths.get(configPath, "/stoplist.txt");
StopList stoplist;
if (Files.exists(stoplistPath)) {
stoplist = new StopListFile(stoplistPath.toFile().getCanonicalPath());
} else {
stoplist = StopListFile.DEFAULT;
}

LinkClassifierFactory.setDefaultStoplist(stoplist);

FrontierManager frontierManager = FrontierManagerFactory.create(config, configPath,
dataPath, modelPath, whitelist, blacklist, seedFile, metricsManager);

OnlineLearning onlineLearning = null;
if (config.isUseOnlineLearning()) {
onlineLearning = createOnlineLearning(dataPath, config, stoplist, frontierManager);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@
import focusedCrawler.util.LinkFilter;
import focusedCrawler.util.MetricsManager;
import focusedCrawler.util.ParameterFile;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.nio.file.Paths;
import java.util.List;

public class FrontierManagerFactory {

Expand All @@ -27,14 +32,36 @@ public static FrontierManager create(LinkStorageConfig config, String configPath

String directory = Paths.get(dataPath, config.getLinkDirectory()).toString();

Frontier frontier = new Frontier(directory, config.getMaxCacheUrlsSize(),
config.getPersistentHashtableBackend());
Frontier frontier = new Frontier(directory, config.getMaxCacheUrlsSize(), config.getPersistentHashtableBackend());

LinkFilter linkFilter = new LinkFilter.Builder().withConfigPath(configPath).build();

LinkSelector linkSelector = createLinkSelector(config);
logger.info("LINK_SELECTOR: " + linkSelector.getClass().getName());

return createFrontierManager(config, dataPath, modelPath, seedFile,
metricsManager, frontier, linkFilter, linkSelector);
}

/**
 * Builds a FrontierManager whose link filter is constructed from the given
 * configuration path plus explicit whitelist/blacklist patterns.
 *
 * @param config         link storage configuration
 * @param configPath     path to the configuration directory
 * @param dataPath       path where crawl data is stored
 * @param modelPath      path to the classifier model
 * @param whitelist      link filter whitelist patterns
 * @param blacklist      link filter blacklist patterns
 * @param seedFile       path to the seed file
 * @param metricsManager metrics registry for frontier instrumentation
 */
public static FrontierManager create(LinkStorageConfig config, String configPath,
        String dataPath, String modelPath,
        List<String> whitelist, List<String> blacklist,
        String seedFile, MetricsManager metricsManager) {

    String linkDirectory = Paths.get(dataPath, config.getLinkDirectory()).toString();

    Frontier frontier = new Frontier(linkDirectory, config.getMaxCacheUrlsSize(),
            config.getPersistentHashtableBackend());

    LinkFilter linkFilter = new LinkFilter.Builder()
            .withConfigPath(configPath, whitelist, blacklist)
            .build();

    LinkSelector linkSelector = createLinkSelector(config);
    logger.info("LINK_SELECTOR: " + linkSelector.getClass().getName());

    return createFrontierManager(config, dataPath, modelPath, seedFile,
            metricsManager, frontier, linkFilter, linkSelector);
}

private static FrontierManager createFrontierManager(LinkStorageConfig config, String dataPath, String modelPath, String seedFile, MetricsManager metricsManager, Frontier frontier, LinkFilter linkFilter, LinkSelector linkSelector) {
LinkSelector recrawlSelector = createRecrawlSelector(config);

FrontierManager frontierManager = new FrontierManager(frontier, dataPath, modelPath, config,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,8 @@ public CrawlerResource(CrawlersManager crawlManager) {

try {
HashMap<String, List<Cookie>> params = json.readValue(request.body(),
new TypeReference<HashMap<String, List<Cookie>>>() {});
new TypeReference<HashMap<String, List<Cookie>>>() {
});

if (params == null || params.isEmpty()) {
response.status(HttpServletResponse.SC_BAD_REQUEST);
Expand Down Expand Up @@ -215,6 +216,8 @@ private Optional<Boolean> getParamAsBoolean(String paramName, Request request) {
public static class StartCrawlParams {
public CrawlType crawlType;
public List<String> seeds;
public List<String> blacklist;
public List<String> whitelist;
public byte[] model;
public String esTypeName;
public String esIndexName;
Expand Down
7 changes: 6 additions & 1 deletion src/main/java/focusedCrawler/tools/StartRestServer.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import focusedCrawler.Main;
import focusedCrawler.config.Configuration;
import focusedCrawler.crawler.CrawlersManager;
import focusedCrawler.link.LinkFilterConfig;
import focusedCrawler.rest.RestServer;
import focusedCrawler.util.CliTool;
import io.airlift.airline.Command;
Expand Down Expand Up @@ -36,12 +37,16 @@ public static void main(String[] args) throws Exception {
@Override
public void execute() throws Exception {
Configuration baseConfig;
LinkFilterConfig linkFilterConfig;
if (configPath != null && !configPath.isEmpty()) {
baseConfig = new Configuration(configPath);
linkFilterConfig = new LinkFilterConfig(configPath);
linkFilterConfig.setFileLocation(configPath);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This setter doesn't seem necessary since configPath is already passed in the constructor.

} else {
baseConfig = new Configuration();
linkFilterConfig = new LinkFilterConfig();
}
CrawlersManager crawlManager = new CrawlersManager(dataPath, baseConfig);
CrawlersManager crawlManager = new CrawlersManager(dataPath, baseConfig, linkFilterConfig);
RestServer server = RestServer.create(baseConfig.getRestConfig(), crawlManager);
server.start();
}
Expand Down
Loading