public class IncrementalIngester extends BaseTable implements IIncrementalIngester
Field | Type | Description |
---|---|---|
id | BIGINT | Primary Key |
connectionname | VARCHAR(32) | Reference:outputconnections.connectionname |
dockey | VARCHAR(73) | |
componenthash | VARCHAR(40) | |
docuri | LONGTEXT | |
urihash | VARCHAR(40) | |
lastversion | LONGTEXT | |
lastoutputversion | LONGTEXT | |
lasttransformationversion | LONGTEXT | |
changecount | BIGINT | |
firstingest | BIGINT | |
lastingest | BIGINT | |
authorityname | VARCHAR(32) | |
Modifier and Type | Class and Description |
---|---|
protected static class |
IncrementalIngester.DeleteInfo
This class contains the information necessary to delete a document
|
protected static class |
IncrementalIngester.MonitoredAddActivityWrapper
This class passes everything through, and monitors what happens so that the
framework can compensate for any transformation connector coding errors.
|
protected static class |
IncrementalIngester.OutputActivitiesWrapper |
protected static class |
IncrementalIngester.OutputAddActivitiesWrapper |
class |
IncrementalIngester.OutputAddEntryPoint |
protected static class |
IncrementalIngester.OutputRecordingActivity
Wrapper class for add activity.
|
protected static class |
IncrementalIngester.OutputRemoveActivitiesWrapper |
static class |
IncrementalIngester.PipelineAddEntryPoint
This class describes the entry stage of an add pipeline.
|
static class |
IncrementalIngester.PipelineAddFanout
This class describes the entry stage of multiple siblings in an add pipeline.
|
static class |
IncrementalIngester.PipelineCheckEntryPoint
This class describes the entry stage of a check pipeline.
|
static class |
IncrementalIngester.PipelineCheckFanout
This class describes the entry stage of multiple siblings in a check pipeline.
|
protected class |
IncrementalIngester.PipelineObject |
protected class |
IncrementalIngester.PipelineObjectWithVersions |
protected static class |
IncrementalIngester.TransformationRecordingActivity
Wrapper class for add activity.
|
Modifier and Type | Field and Description |
---|---|
static java.lang.String |
_rcsid |
protected static java.lang.String |
authorityNameField |
protected static java.lang.String |
changeCountField |
protected static java.lang.String |
componentHashField |
protected IOutputConnectionManager |
connectionManager |
protected static java.lang.String |
docKeyField |
protected static java.lang.String |
docURIField |
protected static java.lang.String |
firstIngestField |
protected static java.lang.String |
idField |
protected static java.lang.String |
lastIngestField |
protected static java.lang.String |
lastOutputVersionField |
protected static java.lang.String |
lastTransformationVersionField |
protected static java.lang.String |
lastVersionField |
protected ILockManager |
lockManager |
protected IOutputConnectorPool |
outputConnectorPool |
protected static java.lang.String |
outputConnNameField |
protected IThreadContext |
threadContext |
protected ITransformationConnectorPool |
transformationConnectorPool |
protected static java.lang.String |
uriHashField |
dbInterface, tableName
Constructor and Description |
---|
IncrementalIngester(IThreadContext threadContext,
IDBInterface database)
Constructor.
|
Modifier and Type | Method and Description |
---|---|
boolean |
checkDateIndexable(IPipelineSpecification pipelineSpecification,
java.util.Date date,
IOutputCheckActivity activity)
Check if a date is indexable.
|
boolean |
checkDocumentIndexable(IPipelineSpecification pipelineSpecification,
java.io.File localFile,
IOutputCheckActivity activity)
Check if a file is indexable.
|
boolean |
checkFetchDocument(IPipelineSpecificationWithVersions pipelineSpecificationWithVersions,
java.lang.String newDocumentVersion,
java.lang.String newAuthorityNameString)
Determine whether we need to fetch or refetch a document.
|
boolean |
checkLengthIndexable(IPipelineSpecification pipelineSpecification,
long length,
IOutputCheckActivity activity)
Pre-determine whether a document's length is indexable by this connector.
|
boolean |
checkMimeTypeIndexable(IPipelineSpecification pipelineSpecification,
java.lang.String mimeType,
IOutputCheckActivity activity)
Check if a mime type is indexable.
|
boolean |
checkURLIndexable(IPipelineSpecification pipelineSpecification,
java.lang.String url,
IOutputCheckActivity activity)
Pre-determine whether a document's URL is indexable by this connector.
|
void |
clearAll()
Flush all knowledge of what was ingested before.
|
protected static java.lang.String[] |
computeLockArray(java.lang.String documentURIHash,
java.lang.String oldURIHash,
java.lang.String outputConnectionName) |
protected static java.lang.String |
computePackedTransformationVersion(IPipelineSpecification pipelineSpecification,
int stage)
Compute a transformation version given a pipeline specification and starting output stage.
|
protected static java.lang.String |
createURILockName(java.lang.String outputConnectionName,
java.lang.String uriHash) |
void |
deinstall()
Uninstall the incremental ingestion manager.
|
protected void |
deleteRowIds(java.util.List<java.lang.Long> list)
Delete a chunk of row ids.
|
void |
documentCheck(IPipelineSpecificationBasic pipelineSpecificationBasic,
java.lang.String identifierClass,
java.lang.String identifierHash,
long checkTime)
Note the fact that we checked a document (and found that it did not need to be ingested, because the
versions agreed).
|
void |
documentCheckMultiple(IPipelineSpecificationBasic pipelineSpecificationBasic,
java.lang.String[] identifierClasses,
java.lang.String[] identifierHashes,
long checkTime)
Note the fact that we checked a set of documents (and found that they did not need to be ingested, because the
versions agreed).
|
void |
documentDelete(IPipelineConnections pipelineConnections,
java.lang.String identifierClass,
java.lang.String identifierHash,
IOutputRemoveActivity activities)
Delete a document from the search engine index.
|
void |
documentDeleteMultiple(IPipelineConnections[] pipelineConnections,
java.lang.String[] identifierClasses,
java.lang.String[] identifierHashes,
IOutputRemoveActivity activities)
Delete multiple documents from the search engine index.
|
void |
documentDeleteMultiple(IPipelineConnections pipelineConnections,
java.lang.String[] identifierClasses,
java.lang.String[] identifierHashes,
IOutputRemoveActivity originalActivities)
Delete multiple documents from the search engine index.
|
boolean |
documentIngest(IPipelineSpecificationWithVersions pipelineSpecificationWithVersions,
java.lang.String identifierClass,
java.lang.String identifierHash,
java.lang.String componentHash,
java.lang.String documentVersion,
java.lang.String authorityName,
RepositoryDocument data,
long ingestTime,
java.lang.String documentURI,
IOutputActivity activities)
Ingest a document.
|
void |
documentNoData(IPipelineSpecificationWithVersions pipelineSpecificationWithVersions,
java.lang.String identifierClass,
java.lang.String identifierHash,
java.lang.String componentHash,
java.lang.String documentVersion,
java.lang.String authorityName,
long recordTime,
IOutputActivity activities)
Remove a document from specified indexes, just as if an empty document
was indexed, and record the necessary version information.
|
void |
documentRecord(IPipelineSpecificationBasic pipelineSpecificationBasic,
java.lang.String identifierClass,
java.lang.String identifierHash,
java.lang.String componentHash,
java.lang.String documentVersion,
long recordTime)
Record a document version, but don't ingest it.
|
void |
documentRemove(IPipelineConnections pipelineConnections,
java.lang.String identifierClass,
java.lang.String identifierHash,
java.lang.String componentHash,
IOutputRemoveActivity activities)
Remove a document component from the search engine index.
|
void |
documentRemoveMultiple(IPipelineConnections pipelineConnections,
java.lang.String[] identifierClasses,
java.lang.String[] identifierHashes,
java.lang.String componentHash,
IOutputRemoveActivity activities)
Remove multiple document components from the search engine index.
|
protected static java.lang.String[] |
extractOutputConnectionNames(IPipelineSpecificationBasic pipelineSpecificationBasic) |
protected void |
findRowIdsForDocIds(java.lang.String[] outputConnectionNames,
java.util.Set<java.lang.Long> rowIDSet,
java.util.List<java.lang.String> paramValues)
Given values and parameters corresponding to a set of hash values, add corresponding
table row id's to the output map.
|
protected void |
findRowIdsForDocIds(java.lang.String outputConnectionName,
java.util.Set<java.lang.Long> rowIDSet,
java.util.List<java.lang.String> paramValues)
Given values and parameters corresponding to a set of hash values, add corresponding
table row id's to the output map.
|
protected void |
findRowIdsForDocIds(java.lang.String outputConnectionName,
java.util.Set<java.lang.Long> rowIDSet,
java.util.List<java.lang.String> paramValues,
java.lang.String componentHash)
Given values and parameters corresponding to a set of hash values, add corresponding
table row id's to the output map.
|
protected void |
findRowIdsForURIs(java.lang.String outputConnectionName,
java.util.Set<java.lang.Long> rowIDSet,
java.util.Set<java.lang.String> uris,
java.util.List<java.lang.String> hashParamValues)
Given values and parameters corresponding to a set of hash values, add corresponding
table row id's to the output map.
|
long |
getDocumentUpdateInterval(IPipelineSpecificationBasic pipelineSpecificationBasic,
java.lang.String identifierClass,
java.lang.String identifierHash)
Calculate the average time interval between changes for a document.
|
long[] |
getDocumentUpdateIntervalMultiple(IPipelineSpecificationBasic pipelineSpecificationBasic,
java.lang.String[] identifierClasses,
java.lang.String[] identifierHashes)
Calculate the average time interval between changes for each of a set of documents.
|
protected void |
getDocumentURIChunk(java.util.List<IncrementalIngester.DeleteInfo> rval,
java.lang.String outputConnectionName,
java.util.List<java.lang.String> list)
Get a chunk of document uris.
|
protected void |
getDocumentURIChunk(java.util.List<IncrementalIngester.DeleteInfo> rval,
java.lang.String outputConnectionName,
java.util.List<java.lang.String> list,
java.lang.String componentHash)
Get a chunk of document uris.
|
protected java.util.List<IncrementalIngester.DeleteInfo> |
getDocumentURIMultiple(java.lang.String outputConnectionName,
java.lang.String[] identifierClasses,
java.lang.String[] identifierHashes)
Find out under what URIs a set of documents is currently ingested.
|
protected java.util.List<IncrementalIngester.DeleteInfo> |
getDocumentURIMultiple(java.lang.String outputConnectionName,
java.lang.String[] identifierClasses,
java.lang.String[] identifierHashes,
java.lang.String componentHash)
Find out under what URIs a set of documents is currently ingested.
|
java.lang.String |
getFirstIndexedOutputConnectionName(IPipelineSpecificationBasic pipelineSpecificationBasic)
From a pipeline specification, get the name of the output connection that will be indexed first
in the pipeline.
|
protected void |
getIntervals(long[] rval,
java.lang.String[] outputConnectionNames,
java.util.List<java.lang.String> list,
java.util.Map<java.lang.String,java.lang.Integer> returnMap)
Query for and calculate the interval for a bunch of hashcodes.
|
java.lang.String |
getLastIndexedOutputConnectionName(IPipelineSpecificationBasic pipelineSpecificationBasic)
From a pipeline specification, get the name of the output connection that will be indexed last
in the pipeline.
|
VersionContext |
getOutputDescription(IOutputConnection outputConnection,
Specification spec)
Get an output version string for a document.
|
void |
getPipelineDocumentIngestData(IngestStatuses rval,
IPipelineSpecificationBasic pipelineSpecificationBasic,
java.lang.String identifierClass,
java.lang.String identifierHash)
Look up ingestion data for a document.
|
protected void |
getPipelineDocumentIngestDataChunk(IngestStatuses rval,
java.util.Map<java.lang.String,java.lang.Integer> map,
java.lang.String[] outputConnectionNames,
java.util.List<java.lang.String> list,
java.lang.String[] identifierClasses,
java.lang.String[] identifierHashes)
Get a chunk of document ingest data records.
|
void |
getPipelineDocumentIngestDataMultiple(IngestStatuses rval,
IPipelineSpecificationBasic[] pipelineSpecificationBasics,
java.lang.String[] identifierClasses,
java.lang.String[] identifierHashes)
Look up ingestion data for a set of documents.
|
void |
getPipelineDocumentIngestDataMultiple(IngestStatuses rval,
IPipelineSpecificationBasic pipelineSpecificationBasic,
java.lang.String[] identifierClasses,
java.lang.String[] identifierHashes)
Look up ingestion data for a set of documents.
|
VersionContext |
getTransformationDescription(ITransformationConnection transformationConnection,
Specification spec)
Get transformation version string for a document.
|
void |
install()
Install the incremental ingestion manager.
|
protected static java.lang.String |
makeKey(java.lang.String documentClass,
java.lang.String documentHash)
Make a key from a document class and a hash
|
protected int |
maxClauseDocumentIngestDataChunk(java.lang.String outputConnectionName)
Count the clauses
|
protected int |
maxClauseDocumentURIChunk(java.lang.String outputConnectionName)
Calculate how many clauses at a time
|
protected int |
maxClauseDocumentURIChunk(java.lang.String outputConnectionName,
java.lang.String componentHash)
Calculate how many clauses at a time
|
protected int |
maxClauseGetIntervals(java.lang.String[] outputConnectionNames)
Calculate the number of clauses.
|
protected int |
maxClausePipelineDocumentIngestDataChunk(java.lang.String[] outputConnectionNames)
Count the clauses
|
protected int |
maxClausesDeleteRowIds()
Calculate the maximum number of clauses.
|
protected int |
maxClausesRowIdsForDocIds(java.lang.String outputConnectionName)
Calculate the maximum number of doc ids we should use.
|
protected int |
maxClausesRowIdsForDocIds(java.lang.String[] outputConnectionNames)
Calculate the maximum number of doc ids we should use.
|
protected int |
maxClausesRowIdsForDocIds(java.lang.String outputConnectionName,
java.lang.String componentHash)
Calculate the maximum number of doc ids we should use.
|
protected int |
maxClausesRowIdsForURIs(java.lang.String outputConnectionName)
Calculate the clauses.
|
protected int |
maxClausesUpdateRowIds()
Calculate the number of clauses.
|
protected void |
noteDocumentIngest(java.lang.String outputConnectionName,
java.lang.String docKey,
java.lang.String componentHash,
java.lang.String documentVersion,
java.lang.String transformationVersion,
java.lang.String outputVersion,
java.lang.String authorityNameString,
long ingestTime,
java.lang.String documentURI,
java.lang.String documentURIHash)
Note the ingestion of a document, or the "update" of a document.
|
protected static void |
pack(java.lang.StringBuilder sb,
java.lang.String value,
char delim) |
protected static void |
packList(java.lang.StringBuilder output,
java.lang.String[] values,
char delimiter) |
protected IncrementalIngester.PipelineObject |
pipelineGrab(IPipelineSpecification pipelineConnections)
Grab the entire pipeline.
|
protected IncrementalIngester.PipelineObjectWithVersions |
pipelineGrabWithVersions(IPipelineSpecificationWithVersions pipelineConnections)
Grab the entire pipeline.
|
protected void |
removeDocument(IOutputConnection connection,
java.lang.String documentURI,
java.lang.String outputDescription,
IOutputRemoveActivity activities)
Remove document, using the specified output connection, via the standard pool.
|
void |
removeOutputConnection(IOutputConnection outputConnection)
Remove all knowledge of an output index from the system.
|
void |
resetOutputConnection(IOutputConnection outputConnection)
Reset all documents belonging to a specific output connection, because we've got information that
that system has been reconfigured.
|
protected void |
updateRowIds(java.util.List<java.lang.Long> list,
long checkTime)
Update a chunk of row ids.
|
addTableIndex, analyzeTable, beginTransaction, buildConjunctionClause, constructCountClause, constructDistinctOnClause, constructDoubleCastClause, constructOffsetLimitClause, constructRegexpClause, constructSubstringClause, endTransaction, findConjunctionClauseMax, getDatabaseCacheKey, getDBInterface, getMaxInClause, getMaxOrClause, getSleepAmt, getTableIndexes, getTableName, getTableSchema, getTransactionID, getWindowedReportMaxRows, makeTableKey, noteModifications, performAddIndex, performAlter, performCommit, performCreate, performDelete, performDrop, performInsert, performModification, performQuery, performQuery, performRemoveIndex, performUpdate, prepareRowForSave, readRow, reindexTable, signalRollback, sleepFor
public static final java.lang.String _rcsid
protected static final java.lang.String idField
protected static final java.lang.String outputConnNameField
protected static final java.lang.String docKeyField
protected static final java.lang.String componentHashField
protected static final java.lang.String docURIField
protected static final java.lang.String uriHashField
protected static final java.lang.String lastVersionField
protected static final java.lang.String lastOutputVersionField
protected static final java.lang.String lastTransformationVersionField
protected static final java.lang.String changeCountField
protected static final java.lang.String firstIngestField
protected static final java.lang.String lastIngestField
protected static final java.lang.String authorityNameField
protected final IThreadContext threadContext
protected final ILockManager lockManager
protected final IOutputConnectionManager connectionManager
protected final IOutputConnectorPool outputConnectorPool
protected final ITransformationConnectorPool transformationConnectorPool
public IncrementalIngester(IThreadContext threadContext, IDBInterface database) throws ManifoldCFException
ManifoldCFException
public void install() throws ManifoldCFException
install
in interface IIncrementalIngester
ManifoldCFException
public void deinstall() throws ManifoldCFException
deinstall
in interface IIncrementalIngester
ManifoldCFException
public void clearAll() throws ManifoldCFException
clearAll
in interface IIncrementalIngester
ManifoldCFException
public java.lang.String getLastIndexedOutputConnectionName(IPipelineSpecificationBasic pipelineSpecificationBasic)
getLastIndexedOutputConnectionName
in interface IIncrementalIngester
pipelineSpecificationBasic
- is the basic pipeline specification.public java.lang.String getFirstIndexedOutputConnectionName(IPipelineSpecificationBasic pipelineSpecificationBasic)
getFirstIndexedOutputConnectionName
in interface IIncrementalIngester
pipelineSpecificationBasic
- is the basic pipeline specification.public boolean checkDateIndexable(IPipelineSpecification pipelineSpecification, java.util.Date date, IOutputCheckActivity activity) throws ManifoldCFException, ServiceInterruption
checkDateIndexable
in interface IIncrementalIngester
pipelineSpecification
- is the IPipelineSpecification object for this pipeline.date
- is the date to check.activity
- are the activities available to this method.ManifoldCFException
ServiceInterruption
public boolean checkMimeTypeIndexable(IPipelineSpecification pipelineSpecification, java.lang.String mimeType, IOutputCheckActivity activity) throws ManifoldCFException, ServiceInterruption
checkMimeTypeIndexable
in interface IIncrementalIngester
pipelineSpecification
- is the IPipelineSpecification object for this pipeline.mimeType
- is the mime type to check.activity
- are the activities available to this method.ManifoldCFException
ServiceInterruption
public boolean checkDocumentIndexable(IPipelineSpecification pipelineSpecification, java.io.File localFile, IOutputCheckActivity activity) throws ManifoldCFException, ServiceInterruption
checkDocumentIndexable
in interface IIncrementalIngester
pipelineSpecification
- is the IPipelineSpecification object for this pipeline.localFile
- is the local file to check.activity
- are the activities available to this method.ManifoldCFException
ServiceInterruption
public boolean checkLengthIndexable(IPipelineSpecification pipelineSpecification, long length, IOutputCheckActivity activity) throws ManifoldCFException, ServiceInterruption
checkLengthIndexable
in interface IIncrementalIngester
pipelineSpecification
- is the IPipelineSpecification object for this pipeline.length
- is the length of the document.activity
- are the activities available to this method.ManifoldCFException
ServiceInterruption
public boolean checkURLIndexable(IPipelineSpecification pipelineSpecification, java.lang.String url, IOutputCheckActivity activity) throws ManifoldCFException, ServiceInterruption
checkURLIndexable
in interface IIncrementalIngester
pipelineSpecification
- is the IPipelineSpecification object for this pipeline.url
- is the url of the document.activity
- are the activities available to this method.ManifoldCFException
ServiceInterruption
protected IncrementalIngester.PipelineObjectWithVersions pipelineGrabWithVersions(IPipelineSpecificationWithVersions pipelineConnections) throws ManifoldCFException
pipelineConnections
- the pipeline specification with version information.
ManifoldCFException
protected IncrementalIngester.PipelineObject pipelineGrab(IPipelineSpecification pipelineConnections) throws ManifoldCFException
pipelineConnections
- the pipeline specification.
ManifoldCFException
public VersionContext getOutputDescription(IOutputConnection outputConnection, Specification spec) throws ManifoldCFException, ServiceInterruption
getOutputDescription
in interface IIncrementalIngester
outputConnection
- is the output connection associated with this action.spec
- is the output specification.ManifoldCFException
ServiceInterruption
public VersionContext getTransformationDescription(ITransformationConnection transformationConnection, Specification spec) throws ManifoldCFException, ServiceInterruption
getTransformationDescription
in interface IIncrementalIngester
transformationConnection
- is the transformation connection associated with this action.spec
- is the transformation specification.ManifoldCFException
ServiceInterruption
public boolean checkFetchDocument(IPipelineSpecificationWithVersions pipelineSpecificationWithVersions, java.lang.String newDocumentVersion, java.lang.String newAuthorityNameString)
checkFetchDocument
in interface IIncrementalIngester
pipelineSpecificationWithVersions
- is the pipeline specification including new version info for all transformation and output
connections.newDocumentVersion
- is the newly-determined document version.newAuthorityNameString
- is the newly-determined authority name.protected static java.lang.String computePackedTransformationVersion(IPipelineSpecification pipelineSpecification, int stage)
pipelineSpecification
- is the pipeline specification.stage
- is the stage number of the output stage.protected static void packList(java.lang.StringBuilder output, java.lang.String[] values, char delimiter)
protected static void pack(java.lang.StringBuilder sb, java.lang.String value, char delim)
public void documentRecord(IPipelineSpecificationBasic pipelineSpecificationBasic, java.lang.String identifierClass, java.lang.String identifierHash, java.lang.String componentHash, java.lang.String documentVersion, long recordTime) throws ManifoldCFException
documentRecord
in interface IIncrementalIngester
pipelineSpecificationBasic
- is the basic pipeline specification needed.identifierClass
- is the name of the space in which the identifier hash should be interpreted.identifierHash
- is the hashed document identifier.componentHash
- is the hashed component identifier, if any.documentVersion
- is the document version.recordTime
- is the time at which the recording took place, in milliseconds since epoch.ManifoldCFException
public void documentNoData(IPipelineSpecificationWithVersions pipelineSpecificationWithVersions, java.lang.String identifierClass, java.lang.String identifierHash, java.lang.String componentHash, java.lang.String documentVersion, java.lang.String authorityName, long recordTime, IOutputActivity activities) throws ManifoldCFException, ServiceInterruption
documentNoData
in interface IIncrementalIngester
pipelineSpecificationWithVersions
- is the pipeline specification with already-fetched output versioning information.identifierClass
- is the name of the space in which the identifier hash should be interpreted.identifierHash
- is the hashed document identifier.componentHash
- is the hashed component identifier, if any.documentVersion
- is the document version.authorityName
- is the name of the authority associated with the document, if any.recordTime
- is the time at which the recording took place, in milliseconds since epoch.activities
- is an object providing a set of methods that the implementer can use to perform the operation.ManifoldCFException
ServiceInterruption
public boolean documentIngest(IPipelineSpecificationWithVersions pipelineSpecificationWithVersions, java.lang.String identifierClass, java.lang.String identifierHash, java.lang.String componentHash, java.lang.String documentVersion, java.lang.String authorityName, RepositoryDocument data, long ingestTime, java.lang.String documentURI, IOutputActivity activities) throws ManifoldCFException, ServiceInterruption, java.io.IOException
documentIngest
in interface IIncrementalIngester
pipelineSpecificationWithVersions
- is the pipeline specification with already-fetched output versioning information.identifierClass
- is the name of the space in which the identifier hash should be interpreted.identifierHash
- is the hashed document identifier.componentHash
- is the hashed component identifier, if any.documentVersion
- is the document version.authorityName
- is the name of the authority associated with the document, if any.data
- is the document data. The data is closed after ingestion is complete.ingestTime
- is the time at which the ingestion took place, in milliseconds since epoch.documentURI
- is the URI of the document, which will be used as the key of the document in the index.activities
- is an object providing a set of methods that the implementer can use to perform the operation.java.io.IOException
- only if data stream throws an IOException.ManifoldCFException
ServiceInterruption
public void documentRemove(IPipelineConnections pipelineConnections, java.lang.String identifierClass, java.lang.String identifierHash, java.lang.String componentHash, IOutputRemoveActivity activities) throws ManifoldCFException, ServiceInterruption
documentRemove
in interface IIncrementalIngester
pipelineConnections
- is the pipeline specification.identifierClass
- is the name of the space in which the identifier hash should be interpreted.identifierHash
- is the hash of the id of the document.componentHash
- is the hashed component identifier, if any.activities
- is the object to use to log the details of the ingestion attempt. May be null.ManifoldCFException
ServiceInterruption
protected static java.lang.String[] extractOutputConnectionNames(IPipelineSpecificationBasic pipelineSpecificationBasic)
public void documentCheckMultiple(IPipelineSpecificationBasic pipelineSpecificationBasic, java.lang.String[] identifierClasses, java.lang.String[] identifierHashes, long checkTime) throws ManifoldCFException
documentCheckMultiple
in interface IIncrementalIngester
pipelineSpecificationBasic
- is a pipeline specification.identifierClasses
- are the names of the spaces in which the identifier hashes should be interpreted.identifierHashes
- are the set of document identifier hashes.checkTime
- is the time at which the check took place, in milliseconds since epoch.ManifoldCFException
public void documentCheck(IPipelineSpecificationBasic pipelineSpecificationBasic, java.lang.String identifierClass, java.lang.String identifierHash, long checkTime) throws ManifoldCFException
documentCheck
in interface IIncrementalIngester
pipelineSpecificationBasic
- is a basic pipeline specification.identifierClass
- is the name of the space in which the identifier hash should be interpreted.identifierHash
- is the hashed document identifier.checkTime
- is the time at which the check took place, in milliseconds since epoch.ManifoldCFException
protected int maxClausesUpdateRowIds()
protected void updateRowIds(java.util.List<java.lang.Long> list, long checkTime) throws ManifoldCFException
ManifoldCFException
public void documentDeleteMultiple(IPipelineConnections[] pipelineConnections, java.lang.String[] identifierClasses, java.lang.String[] identifierHashes, IOutputRemoveActivity activities) throws ManifoldCFException, ServiceInterruption
documentDeleteMultiple
in interface IIncrementalIngester
pipelineConnections
- are the pipeline specifications associated with the documents.identifierClasses
- are the names of the spaces in which the identifier hashes should be interpreted.identifierHashes
- is the array of document identifier hashes of the documents.activities
- is the object to use to log the details of the ingestion attempt. May be null.ManifoldCFException
ServiceInterruption
protected static java.lang.String createURILockName(java.lang.String outputConnectionName, java.lang.String uriHash)
public void documentDeleteMultiple(IPipelineConnections pipelineConnections, java.lang.String[] identifierClasses, java.lang.String[] identifierHashes, IOutputRemoveActivity originalActivities) throws ManifoldCFException, ServiceInterruption
documentDeleteMultiple
in interface IIncrementalIngester
pipelineConnections
- is the pipeline specification.identifierClasses
- are the names of the spaces in which the identifier hashes should be interpreted.identifierHashes
- is the array of document identifier hashes of the documents.originalActivities
- is the object to use to log the details of the ingestion attempt. May be null.ManifoldCFException
ServiceInterruption
public void documentRemoveMultiple(IPipelineConnections pipelineConnections, java.lang.String[] identifierClasses, java.lang.String[] identifierHashes, java.lang.String componentHash, IOutputRemoveActivity activities) throws ManifoldCFException, ServiceInterruption
documentRemoveMultiple
in interface IIncrementalIngester
pipelineConnections
- is the pipeline specification.identifierClasses
- are the names of the spaces in which the identifier hashes should be interpreted.identifierHashes
- are the hashes of the ids of the documents.componentHash
- is the hashed component identifier, if any.activities
- is the object to use to log the details of the ingestion attempt. May be null.ManifoldCFException
ServiceInterruption
protected int maxClausesRowIdsForURIs(java.lang.String outputConnectionName)
protected void findRowIdsForURIs(java.lang.String outputConnectionName, java.util.Set<java.lang.Long> rowIDSet, java.util.Set<java.lang.String> uris, java.util.List<java.lang.String> hashParamValues) throws ManifoldCFException
ManifoldCFException
protected int maxClausesRowIdsForDocIds(java.lang.String outputConnectionName)
protected int maxClausesRowIdsForDocIds(java.lang.String outputConnectionName, java.lang.String componentHash)
protected int maxClausesRowIdsForDocIds(java.lang.String[] outputConnectionNames)
protected void findRowIdsForDocIds(java.lang.String outputConnectionName, java.util.Set<java.lang.Long> rowIDSet, java.util.List<java.lang.String> paramValues) throws ManifoldCFException
ManifoldCFException
protected void findRowIdsForDocIds(java.lang.String outputConnectionName, java.util.Set<java.lang.Long> rowIDSet, java.util.List<java.lang.String> paramValues, java.lang.String componentHash) throws ManifoldCFException
ManifoldCFException
protected void findRowIdsForDocIds(java.lang.String[] outputConnectionNames, java.util.Set<java.lang.Long> rowIDSet, java.util.List<java.lang.String> paramValues) throws ManifoldCFException
ManifoldCFException
protected int maxClausesDeleteRowIds()
protected void deleteRowIds(java.util.List<java.lang.Long> list) throws ManifoldCFException
ManifoldCFException
public void documentDelete(IPipelineConnections pipelineConnections, java.lang.String identifierClass, java.lang.String identifierHash, IOutputRemoveActivity activities) throws ManifoldCFException, ServiceInterruption
documentDelete
in interface IIncrementalIngester
pipelineConnections
- is the pipeline specification.identifierClass
- is the name of the space in which the identifier hash should be interpreted.identifierHash
- is the hash of the id of the document.activities
- is the object to use to log the details of the ingestion attempt. May be null.ManifoldCFException
ServiceInterruption
protected java.util.List<IncrementalIngester.DeleteInfo> getDocumentURIMultiple(java.lang.String outputConnectionName, java.lang.String[] identifierClasses, java.lang.String[] identifierHashes) throws ManifoldCFException
identifierHashes
- is the array of document identifier hashes to check.ManifoldCFException
protected java.util.List<IncrementalIngester.DeleteInfo> getDocumentURIMultiple(java.lang.String outputConnectionName, java.lang.String[] identifierClasses, java.lang.String[] identifierHashes, java.lang.String componentHash) throws ManifoldCFException
outputConnectionName
- is the output connection name.identifierClasses
- is the array of identifier classes.identifierHashes
- is the array of document identifier hashes to check.componentHash
- is the component hash to check.ManifoldCFException
public void getPipelineDocumentIngestDataMultiple(IngestStatuses rval, IPipelineSpecificationBasic[] pipelineSpecificationBasics, java.lang.String[] identifierClasses, java.lang.String[] identifierHashes) throws ManifoldCFException
getPipelineDocumentIngestDataMultiple
in interface IIncrementalIngester
rval
- is a map of output key to document data, in no particular order, which will be loaded with all matching results.pipelineSpecificationBasics
- are the pipeline specifications corresponding to the identifier classes and hashes.identifierClasses
- are the names of the spaces in which the identifier hashes should be interpreted.identifierHashes
- is the array of document identifier hashes to look up.ManifoldCFException
public void getPipelineDocumentIngestDataMultiple(IngestStatuses rval, IPipelineSpecificationBasic pipelineSpecificationBasic, java.lang.String[] identifierClasses, java.lang.String[] identifierHashes) throws ManifoldCFException
getPipelineDocumentIngestDataMultiple
in interface IIncrementalIngester
rval
- is a map of output key to document data, in no particular order, which will be loaded with all matching results.pipelineSpecificationBasic
- is the pipeline specification for all documents.identifierClasses
- are the names of the spaces in which the identifier hashes should be interpreted.identifierHashes
- is the array of document identifier hashes to look up.ManifoldCFException
protected void getPipelineDocumentIngestDataChunk(IngestStatuses rval, java.util.Map<java.lang.String,java.lang.Integer> map, java.lang.String[] outputConnectionNames, java.util.List<java.lang.String> list, java.lang.String[] identifierClasses, java.lang.String[] identifierHashes) throws ManifoldCFException
rval
- is the document ingest status array where the data should be put.map
- is the map from id to index.list
- is the parameter list for the query.ManifoldCFException
public void getPipelineDocumentIngestData(IngestStatuses rval, IPipelineSpecificationBasic pipelineSpecificationBasic, java.lang.String identifierClass, java.lang.String identifierHash) throws ManifoldCFException
getPipelineDocumentIngestData
in interface IIncrementalIngester
rval
- is a map of output key to document data, in no particular order, which will be loaded with all matching results.pipelineSpecificationBasic
- is the pipeline specification for the document.identifierClass
- is the name of the space in which the identifier hash should be interpreted.identifierHash
- is the hash of the id of the document.ManifoldCFException
public long[] getDocumentUpdateIntervalMultiple(IPipelineSpecificationBasic pipelineSpecificationBasic, java.lang.String[] identifierClasses, java.lang.String[] identifierHashes) throws ManifoldCFException
getDocumentUpdateIntervalMultiple
in interface IIncrementalIngester
pipelineSpecificationBasic
- is the basic pipeline specification.identifierClasses
- are the names of the spaces in which the identifier hashes should be interpreted.identifierHashes
- are the hashes of the ids of the documents.ManifoldCFException
public long getDocumentUpdateInterval(IPipelineSpecificationBasic pipelineSpecificationBasic, java.lang.String identifierClass, java.lang.String identifierHash) throws ManifoldCFException
getDocumentUpdateInterval
in interface IIncrementalIngester
pipelineSpecificationBasic
- is the basic pipeline specification.identifierClass
- is the name of the space in which the identifier hash should be interpreted.identifierHash
- is the hash of the id of the document.ManifoldCFException
protected int maxClauseGetIntervals(java.lang.String[] outputConnectionNames)
protected void getIntervals(long[] rval, java.lang.String[] outputConnectionNames, java.util.List<java.lang.String> list, java.util.Map<java.lang.String,java.lang.Integer> returnMap) throws ManifoldCFException
rval
- is the array to stuff calculated return values into.list
- is the list of parameters.returnMap
- is a mapping from document id to rval index.ManifoldCFException
public void resetOutputConnection(IOutputConnection outputConnection) throws ManifoldCFException
resetOutputConnection
in interface IIncrementalIngester
outputConnection
- is the output connection associated with this action.ManifoldCFException
public void removeOutputConnection(IOutputConnection outputConnection) throws ManifoldCFException
removeOutputConnection
in interface IIncrementalIngester
outputConnection
- is the output connection associated with this action.ManifoldCFException
protected void noteDocumentIngest(java.lang.String outputConnectionName, java.lang.String docKey, java.lang.String componentHash, java.lang.String documentVersion, java.lang.String transformationVersion, java.lang.String outputVersion, java.lang.String authorityNameString, long ingestTime, java.lang.String documentURI, java.lang.String documentURIHash) throws ManifoldCFException
outputConnectionName
- is the name of the output connection.docKey
- is the key string describing the document.componentHash
- is the component identifier hash for this document.documentVersion
- is a string describing the new version of the document.transformationVersion
- is a string describing all current transformations for the document.outputVersion
- is the version string calculated for the output connection.authorityNameString
- is the name of the relevant authority connection.ingestTime
- is the time at which the ingestion took place, in milliseconds since epoch.documentURI
- is the uri the document can be accessed at, or null (which signals that we are to record the version, but no
ingestion took place).documentURIHash
- is the hash of the document uri.ManifoldCFException
protected int maxClauseDocumentURIChunk(java.lang.String outputConnectionName)
protected void getDocumentURIChunk(java.util.List<IncrementalIngester.DeleteInfo> rval, java.lang.String outputConnectionName, java.util.List<java.lang.String> list) throws ManifoldCFException
rval
- is the list of DeleteInfo objects where the uris should be put.list
- are the doc keys for the query.ManifoldCFException
protected int maxClauseDocumentURIChunk(java.lang.String outputConnectionName, java.lang.String componentHash)
protected void getDocumentURIChunk(java.util.List<IncrementalIngester.DeleteInfo> rval, java.lang.String outputConnectionName, java.util.List<java.lang.String> list, java.lang.String componentHash) throws ManifoldCFException
rval
- is the list of DeleteInfo objects where the uris should be put.list
- are the doc keys for the query.componentHash
- is the component hash, if any, for the query.ManifoldCFException
protected int maxClauseDocumentIngestDataChunk(java.lang.String outputConnectionName)
protected int maxClausePipelineDocumentIngestDataChunk(java.lang.String[] outputConnectionNames)
protected void removeDocument(IOutputConnection connection, java.lang.String documentURI, java.lang.String outputDescription, IOutputRemoveActivity activities) throws ManifoldCFException, ServiceInterruption
protected static java.lang.String makeKey(java.lang.String documentClass, java.lang.String documentHash)
protected static java.lang.String[] computeLockArray(java.lang.String documentURIHash, java.lang.String oldURIHash, java.lang.String outputConnectionName)