public class CommonCrawlFormatWARC extends AbstractCommonCrawlFormat
Modifier and Type | Field and Description |
---|---|
static String | MAX_WARC_FILE_SIZE |
static String | TEMPLATE |
Fields inherited from class AbstractCommonCrawlFormat: conf, content, inLinks, jsonArray, keyPrefix, LOG, metadata, reverseKey, reverseKeyValue, simpleDateFormat, url
Constructor and Description |
---|
CommonCrawlFormatWARC(Configuration nutchConf, CommonCrawlConfig config) |
CommonCrawlFormatWARC(String url, Content content, Metadata metadata, Configuration nutchConf, CommonCrawlConfig config, ParseData parseData) |
Modifier and Type | Method and Description |
---|---|
void |
close()
Optional method that can be implemented if the format needs a
close procedure.
|
protected void |
closeArray(String key,
boolean nested,
boolean newline) |
protected void |
closeObject(String key) |
protected String |
generateJson() |
String |
getJsonData() |
String |
getJsonData(String url,
Content content,
Metadata metadata,
ParseData parseData)
Returns a string representation of the JSON structure of the URL content,
taking into account the parsed metadata about the URL.
|
protected void |
startArray(String key,
boolean nested,
boolean newline) |
protected void |
startObject(String key) |
protected void |
writeArrayValue(String value) |
protected void |
writeKeyNull(String key) |
protected void |
writeKeyValue(String key,
String value) |
protected URI |
writeRequest(URI id) |
protected URI |
writeResponse() |
Methods inherited from class AbstractCommonCrawlFormat: getImported, getInLinks, getJsonData, getKey, getMethod, getRequestAccept, getRequestAcceptEncoding, getRequestAcceptLanguage, getRequestContactEmail, getRequestContactName, getRequestHostAddress, getRequestHostName, getRequestRobots, getRequestSoftware, getRequestUserAgent, getResponseAddress, getResponseContent, getResponseContentEncoding, getResponseContentType, getResponseDate, getResponseHostName, getResponseServer, getResponseStatus, getTimestamp, getUrl, setInLinks
public static final String MAX_WARC_FILE_SIZE
public static final String TEMPLATE
public CommonCrawlFormatWARC(Configuration nutchConf, CommonCrawlConfig config) throws IOException
IOException
public CommonCrawlFormatWARC(String url, Content content, Metadata metadata, Configuration nutchConf, CommonCrawlConfig config, ParseData parseData) throws IOException
IOException
public String getJsonData(String url, Content content, Metadata metadata, ParseData parseData) throws IOException
CommonCrawlFormat
getJsonData
in interface CommonCrawlFormat
getJsonData
in class AbstractCommonCrawlFormat
IOException
public String getJsonData() throws IOException
getJsonData
in interface CommonCrawlFormat
getJsonData
in class AbstractCommonCrawlFormat
IOException
protected URI writeResponse() throws IOException, ParseException
IOException
ParseException
protected URI writeRequest(URI id) throws IOException, ParseException
IOException
ParseException
protected String generateJson() throws IOException
generateJson
in class AbstractCommonCrawlFormat
IOException
protected void writeKeyValue(String key, String value) throws IOException
writeKeyValue
in class AbstractCommonCrawlFormat
IOException
protected void writeKeyNull(String key) throws IOException
writeKeyNull
in class AbstractCommonCrawlFormat
IOException
protected void startArray(String key, boolean nested, boolean newline) throws IOException
startArray
in class AbstractCommonCrawlFormat
IOException
protected void closeArray(String key, boolean nested, boolean newline) throws IOException
closeArray
in class AbstractCommonCrawlFormat
IOException
protected void writeArrayValue(String value) throws IOException
writeArrayValue
in class AbstractCommonCrawlFormat
IOException
protected void startObject(String key) throws IOException
startObject
in class AbstractCommonCrawlFormat
IOException
protected void closeObject(String key) throws IOException
closeObject
in class AbstractCommonCrawlFormat
IOException
public void close()
CommonCrawlFormat
close
in interface Closeable
close
in interface AutoCloseable
close
in interface CommonCrawlFormat
close
in class AbstractCommonCrawlFormat
Copyright © 2021 The Apache Software Foundation