public abstract class AbstractCommonCrawlFormat extends Object implements CommonCrawlFormat
Modifier and Type | Field and Description |
---|---|
protected Configuration |
conf |
protected Content |
content |
protected List<String> |
inLinks |
protected boolean |
jsonArray |
protected String |
keyPrefix |
protected static org.slf4j.Logger |
LOG |
protected Metadata |
metadata |
protected boolean |
reverseKey |
protected String |
reverseKeyValue |
protected boolean |
simpleDateFormat |
protected String |
url |
Constructor and Description |
---|
AbstractCommonCrawlFormat(String url,
Content content,
Metadata metadata,
Configuration nutchConf,
CommonCrawlConfig config) |
Modifier and Type | Method and Description |
---|---|
void |
close()
Optional method that could be implemented if the actual format needs some
close procedure.
|
protected abstract void |
closeArray(String key,
boolean nested,
boolean newline) |
protected abstract void |
closeObject(String key) |
protected abstract String |
generateJson() |
protected String |
getImported() |
List<String> |
getInLinks()
Gets the list of inlinks.
|
String |
getJsonData() |
String |
getJsonData(String url,
Content content,
Metadata metadata)
Returns a string representation of the JSON structure of the URL content
|
String |
getJsonData(String url,
Content content,
Metadata metadata,
ParseData parseData)
Returns a string representation of the JSON structure of the URL content,
taking into account the parsed metadata about the URL.
|
protected String |
getKey() |
protected String |
getMethod() |
protected String |
getRequestAccept() |
protected String |
getRequestAcceptEncoding() |
protected String |
getRequestAcceptLanguage() |
protected String |
getRequestContactEmail() |
protected String |
getRequestContactName() |
protected String |
getRequestHostAddress() |
protected String |
getRequestHostName() |
protected String |
getRequestRobots() |
protected String |
getRequestSoftware() |
protected String |
getRequestUserAgent() |
protected String |
getResponseAddress() |
protected String |
getResponseContent() |
protected String |
getResponseContentEncoding() |
protected String |
getResponseContentType() |
protected String |
getResponseDate() |
protected String |
getResponseHostName() |
protected String |
getResponseServer() |
protected String |
getResponseStatus() |
protected String |
getTimestamp() |
protected String |
getUrl() |
void |
setInLinks(List<String> inLinks)
Sets the inlinks of this document.
|
protected abstract void |
startArray(String key,
boolean nested,
boolean newline) |
protected abstract void |
startObject(String key) |
protected abstract void |
writeArrayValue(String value) |
protected abstract void |
writeKeyNull(String key) |
protected abstract void |
writeKeyValue(String key,
String value) |
protected static final org.slf4j.Logger LOG
protected String url
protected Content content
protected Metadata metadata
protected Configuration conf
protected String keyPrefix
protected boolean simpleDateFormat
protected boolean jsonArray
protected boolean reverseKey
protected String reverseKeyValue
public AbstractCommonCrawlFormat(String url, Content content, Metadata metadata, Configuration nutchConf, CommonCrawlConfig config) throws IOException
IOException
public String getJsonData(String url, Content content, Metadata metadata) throws IOException
CommonCrawlFormat
getJsonData
in interface CommonCrawlFormat
IOException
public String getJsonData(String url, Content content, Metadata metadata, ParseData parseData) throws IOException
CommonCrawlFormat
getJsonData
in interface CommonCrawlFormat
IOException
public String getJsonData() throws IOException
getJsonData
in interface CommonCrawlFormat
IOException
protected abstract void writeKeyValue(String key, String value) throws IOException
IOException
protected abstract void writeKeyNull(String key) throws IOException
IOException
protected abstract void startArray(String key, boolean nested, boolean newline) throws IOException
IOException
protected abstract void closeArray(String key, boolean nested, boolean newline) throws IOException
IOException
protected abstract void writeArrayValue(String value) throws IOException
IOException
protected abstract void startObject(String key) throws IOException
IOException
protected abstract void closeObject(String key) throws IOException
IOException
protected abstract String generateJson() throws IOException
IOException
protected String getUrl()
protected String getTimestamp()
protected String getMethod()
protected String getRequestHostName()
protected String getRequestHostAddress()
protected String getRequestSoftware()
protected String getRequestRobots()
protected String getRequestContactName()
protected String getRequestContactEmail()
protected String getRequestAccept()
protected String getRequestAcceptEncoding()
protected String getRequestAcceptLanguage()
protected String getRequestUserAgent()
protected String getResponseStatus()
protected String getResponseHostName()
protected String getResponseAddress()
protected String getResponseContentEncoding()
protected String getResponseContentType()
public List<String> getInLinks()
CommonCrawlFormat
getInLinks
in interface CommonCrawlFormat
public void setInLinks(List<String> inLinks)
CommonCrawlFormat
setInLinks
in interface CommonCrawlFormat
inLinks
- list of inlinks
protected String getResponseDate()
protected String getResponseServer()
protected String getResponseContent()
protected String getKey()
protected String getImported()
public void close()
CommonCrawlFormat
close
in interface Closeable
close
in interface AutoCloseable
close
in interface CommonCrawlFormat
Copyright © 2021 The Apache Software Foundation