Skip to content

Commit

Permalink
[webcrawler] Do not fail in case of unknown properties in the JSON fi…
Browse files Browse the repository at this point in the history
…le (handle backward compatibility) (#469)
  • Loading branch information
eolivelli authored Sep 21, 2023
1 parent c0248c3 commit e9f9ad2
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;
import lombok.Getter;
import lombok.extern.slf4j.Slf4j;

@Slf4j
Expand All @@ -74,7 +75,7 @@ public class WebCrawlerSource extends AbstractAgentCode implements AgentSource {
private MinioClient minioClient;
private int reindexIntervalSeconds;

private String statusFileName;
@Getter private String statusFileName;

private WebCrawler crawler;

Expand Down Expand Up @@ -107,7 +108,7 @@ public void init(Map<String, Object> configuration) throws Exception {
allowedDomains = getSet("allowed-domains", configuration);
forbiddenPaths = getSet("forbidden-paths", configuration);
maxUrls = getInt("max-urls", 1000, configuration);
int maxDepth = getInt("max-depth", 10, configuration);
int maxDepth = getInt("max-depth", 50, configuration);
handleRobotsFile = getBoolean("handle-robots-file", true, configuration);
scanHtmlDocuments = getBoolean("scan-html-documents", true, configuration);
seedUrls = getSet("seed-urls", configuration);
Expand Down Expand Up @@ -394,7 +395,12 @@ public Status getCurrentStatus() throws Exception {
.build());
byte[] content = result.readAllBytes();
log.info("Restoring status from {}, {} bytes", statusFileName, content.length);
return MAPPER.readValue(content, Status.class);
try {
return MAPPER.readValue(content, Status.class);
} catch (IOException e) {
log.error("Error parsing status file, restarting from scratch", e);
return null;
}
} catch (ErrorResponseException e) {
if (e.errorResponse().code().equals("NoSuchKey")) {
log.info("No status file found, starting from scratch");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,11 @@
import ai.langstream.api.runner.topics.TopicProducer;
import com.github.tomakehurst.wiremock.junit5.WireMockRuntimeInfo;
import com.github.tomakehurst.wiremock.junit5.WireMockTest;
import io.minio.MakeBucketArgs;
import io.minio.MinioClient;
import io.minio.PutObjectArgs;
import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.HashSet;
Expand Down Expand Up @@ -422,4 +426,45 @@ public Path getCodeDirectory() {
agentSource.start();
return (WebCrawlerSource) agentSource;
}

@Test
void testRecoverFromWrongJsonFile() throws Exception {
String bucket = "langstream-test-" + UUID.randomUUID();
String url = "https://www.datastax.com/";
String allowed = "https://www.datastax.com/";

String objectName = "test-global-agent-id.webcrawler.status.json";
String json =
"""
{
"some-field": "some-value"
}
""";
minioClient.makeBucket(MakeBucketArgs.builder().bucket(bucket).build());
minioClient.putObject(
PutObjectArgs.builder()
.bucket(bucket)
.contentType("application/json")
.object(objectName)
.stream(
new ByteArrayInputStream(json.getBytes(StandardCharsets.UTF_8)),
json.length(),
5 * 1024 * 1024)
.build());
WebCrawlerSource agentSource =
buildAgentSource(
bucket,
allowed,
Set.of(),
url,
Map.of(
"reindex-interval-seconds",
"3600",
"scan-html-documents",
"false",
"max-urls",
10000));
assertEquals(objectName, agentSource.getStatusFileName());
agentSource.close();
}
}

0 comments on commit e9f9ad2

Please sign in to comment.