public interface IIncrementalIngester
Modifier and Type | Field and Description |
---|---|
static java.lang.String |
_rcsid |
Modifier and Type | Method and Description |
---|---|
boolean |
checkDateIndexable(IPipelineSpecification pipelineSpecification,
java.util.Date date,
IOutputCheckActivity activity)
Check if a document date is indexable.
|
boolean |
checkDocumentIndexable(IPipelineSpecification pipelineSpecification,
java.io.File localFile,
IOutputCheckActivity activity)
Check if a file is indexable.
|
boolean |
checkFetchDocument(IPipelineSpecificationWithVersions pipelineSpecificationWithVersions,
java.lang.String newDocumentVersion,
java.lang.String newAuthorityNameString)
Determine whether we need to fetch or refetch a document.
|
boolean |
checkLengthIndexable(IPipelineSpecification pipelineSpecification,
long length,
IOutputCheckActivity activity)
Pre-determine whether a document's length is indexable by this connector.
|
boolean |
checkMimeTypeIndexable(IPipelineSpecification pipelineSpecification,
java.lang.String mimeType,
IOutputCheckActivity activity)
Check if a mime type is indexable.
|
boolean |
checkURLIndexable(IPipelineSpecification pipelineSpecification,
java.lang.String url,
IOutputCheckActivity activity)
Pre-determine whether a document's URL is indexable by this connector.
|
void |
clearAll()
Flush all knowledge of what was ingested before.
|
void |
deinstall()
Uninstall the incremental ingestion manager.
|
void |
documentCheck(IPipelineSpecificationBasic pipelineSpecificationBasic,
java.lang.String identifierClass,
java.lang.String identifierHash,
long checkTime)
Note the fact that we checked a document (and found that it did not need to be ingested, because the
versions agreed).
|
void |
documentCheckMultiple(IPipelineSpecificationBasic pipelineSpecificationBasic,
java.lang.String[] identifierClasses,
java.lang.String[] identifierHashes,
long checkTime)
Note the fact that we checked a set of documents (and found that they did not need to be ingested, because the
versions agreed).
|
void |
documentDelete(IPipelineConnections pipelineConnections,
java.lang.String identifierClass,
java.lang.String identifierHash,
IOutputRemoveActivity activities)
Delete a document, and all its components, from the search engine index.
|
void |
documentDeleteMultiple(IPipelineConnections[] pipelineConnections,
java.lang.String[] identifierClasses,
java.lang.String[] identifierHashes,
IOutputRemoveActivity activities)
Delete multiple documents, and their components, from the search engine index.
|
void |
documentDeleteMultiple(IPipelineConnections pipelineConnections,
java.lang.String[] identifierClasses,
java.lang.String[] identifierHashes,
IOutputRemoveActivity activities)
Delete multiple documents, and their components, from the search engine index.
|
boolean |
documentIngest(IPipelineSpecificationWithVersions pipelineSpecificationWithVersions,
java.lang.String identifierClass,
java.lang.String identifierHash,
java.lang.String componentHash,
java.lang.String documentVersion,
java.lang.String authorityName,
RepositoryDocument data,
long ingestTime,
java.lang.String documentURI,
IOutputActivity activities)
Ingest a document.
|
void |
documentNoData(IPipelineSpecificationWithVersions pipelineSpecificationWithVersions,
java.lang.String identifierClass,
java.lang.String identifierHash,
java.lang.String componentHash,
java.lang.String documentVersion,
java.lang.String authorityName,
long recordTime,
IOutputActivity activities)
Remove a document from specified indexes, just as if an empty document
was indexed, and record the necessary version information.
|
void |
documentRecord(IPipelineSpecificationBasic pipelineSpecificationBasic,
java.lang.String identifierClass,
java.lang.String identifierHash,
java.lang.String componentHash,
java.lang.String documentVersion,
long recordTime)
Record a document version, but don't ingest it.
|
void |
documentRemove(IPipelineConnections pipelineConnections,
java.lang.String identifierClass,
java.lang.String identifierHash,
java.lang.String componentHash,
IOutputRemoveActivity activities)
Remove a document component from the search engine index.
|
void |
documentRemoveMultiple(IPipelineConnections pipelineConnections,
java.lang.String[] identifierClasses,
java.lang.String[] identifierHashes,
java.lang.String componentHash,
IOutputRemoveActivity activities)
Remove multiple document components from the search engine index.
|
long |
getDocumentUpdateInterval(IPipelineSpecificationBasic pipelineSpecificationBasic,
java.lang.String identifierClass,
java.lang.String identifierHash)
Calculate the average time interval between changes for a document.
|
long[] |
getDocumentUpdateIntervalMultiple(IPipelineSpecificationBasic pipelineSpecificationBasic,
java.lang.String[] identifierClasses,
java.lang.String[] identifierHashes)
Calculate the average time interval between changes for each of a set of documents.
|
java.lang.String |
getFirstIndexedOutputConnectionName(IPipelineSpecificationBasic pipelineSpecificationBasic)
From a pipeline specification, get the name of the output connection that will be indexed first
in the pipeline.
|
java.lang.String |
getLastIndexedOutputConnectionName(IPipelineSpecificationBasic pipelineSpecificationBasic)
From a pipeline specification, get the name of the output connection that will be indexed last
in the pipeline.
|
VersionContext |
getOutputDescription(IOutputConnection outputConnection,
Specification spec)
Get an output version string for a document.
|
void |
getPipelineDocumentIngestData(IngestStatuses rval,
IPipelineSpecificationBasic pipelineSpecificationBasic,
java.lang.String identifierClass,
java.lang.String identifierHash)
Look up ingestion data for a document.
|
void |
getPipelineDocumentIngestDataMultiple(IngestStatuses rval,
IPipelineSpecificationBasic[] pipelineSpecificationBasics,
java.lang.String[] identifierClasses,
java.lang.String[] identifierHashes)
Look up ingestion data for a set of documents.
|
void |
getPipelineDocumentIngestDataMultiple(IngestStatuses rval,
IPipelineSpecificationBasic pipelineSpecificationBasic,
java.lang.String[] identifierClasses,
java.lang.String[] identifierHashes)
Look up ingestion data for a set of documents.
|
VersionContext |
getTransformationDescription(ITransformationConnection transformationConnection,
Specification spec)
Get a transformation version string for a document.
|
void |
install()
Install the incremental ingestion manager.
|
void |
removeOutputConnection(IOutputConnection outputConnection)
Remove all knowledge of an output index from the system.
|
void |
resetOutputConnection(IOutputConnection outputConnection)
Reset all documents belonging to a specific output connection, because we've got information that
that system has been reconfigured.
|
static final java.lang.String _rcsid
void install() throws ManifoldCFException
ManifoldCFException
void deinstall() throws ManifoldCFException
ManifoldCFException
void clearAll() throws ManifoldCFException
ManifoldCFException
java.lang.String getLastIndexedOutputConnectionName(IPipelineSpecificationBasic pipelineSpecificationBasic)
pipelineSpecificationBasic
- is the basic pipeline specification.
java.lang.String getFirstIndexedOutputConnectionName(IPipelineSpecificationBasic pipelineSpecificationBasic)
pipelineSpecificationBasic
- is the basic pipeline specification.
VersionContext getOutputDescription(IOutputConnection outputConnection, Specification spec) throws ManifoldCFException, ServiceInterruption
outputConnection
- is the output connection associated with this action.spec
- is the output specification.ManifoldCFException
ServiceInterruption
VersionContext getTransformationDescription(ITransformationConnection transformationConnection, Specification spec) throws ManifoldCFException, ServiceInterruption
transformationConnection
- is the transformation connection associated with this action.spec
- is the transformation specification.ManifoldCFException
ServiceInterruption
boolean checkDateIndexable(IPipelineSpecification pipelineSpecification, java.util.Date date, IOutputCheckActivity activity) throws ManifoldCFException, ServiceInterruption
pipelineSpecification
- is the IPipelineSpecification object for this pipeline.date
- is the date to check.activity
- are the activities available to this method.ManifoldCFException
ServiceInterruption
boolean checkMimeTypeIndexable(IPipelineSpecification pipelineSpecification, java.lang.String mimeType, IOutputCheckActivity activity) throws ManifoldCFException, ServiceInterruption
pipelineSpecification
- is the IPipelineSpecification object for this pipeline.mimeType
- is the mime type to check.activity
- are the activities available to this method.ManifoldCFException
ServiceInterruption
boolean checkDocumentIndexable(IPipelineSpecification pipelineSpecification, java.io.File localFile, IOutputCheckActivity activity) throws ManifoldCFException, ServiceInterruption
pipelineSpecification
- is the IPipelineSpecification object for this pipeline.localFile
- is the local file to check.activity
- are the activities available to this method.ManifoldCFException
ServiceInterruption
boolean checkLengthIndexable(IPipelineSpecification pipelineSpecification, long length, IOutputCheckActivity activity) throws ManifoldCFException, ServiceInterruption
pipelineSpecification
- is the IPipelineSpecification object for this pipeline.length
- is the length of the document.activity
- are the activities available to this method.ManifoldCFException
ServiceInterruption
boolean checkURLIndexable(IPipelineSpecification pipelineSpecification, java.lang.String url, IOutputCheckActivity activity) throws ManifoldCFException, ServiceInterruption
pipelineSpecification
- is the IPipelineSpecification object for this pipeline.url
- is the url of the document.activity
- are the activities available to this method.ManifoldCFException
ServiceInterruption
boolean checkFetchDocument(IPipelineSpecificationWithVersions pipelineSpecificationWithVersions, java.lang.String newDocumentVersion, java.lang.String newAuthorityNameString)
pipelineSpecificationWithVersions
- is the pipeline specification including new version info for all transformation and output
connections.newDocumentVersion
- is the newly-determined document version.newAuthorityNameString
- is the newly-determined authority name.
void documentRecord(IPipelineSpecificationBasic pipelineSpecificationBasic, java.lang.String identifierClass, java.lang.String identifierHash, java.lang.String componentHash, java.lang.String documentVersion, long recordTime) throws ManifoldCFException
pipelineSpecificationBasic
- is the basic pipeline specification needed.identifierClass
- is the name of the space in which the identifier hash should be interpreted.identifierHash
- is the hashed document identifier.componentHash
- is the hashed component identifier, if any.documentVersion
- is the document version.recordTime
- is the time at which the recording took place, in milliseconds since epoch.ManifoldCFException
void documentNoData(IPipelineSpecificationWithVersions pipelineSpecificationWithVersions, java.lang.String identifierClass, java.lang.String identifierHash, java.lang.String componentHash, java.lang.String documentVersion, java.lang.String authorityName, long recordTime, IOutputActivity activities) throws ManifoldCFException, ServiceInterruption
pipelineSpecificationWithVersions
- is the pipeline specification with already-fetched output versioning information.identifierClass
- is the name of the space in which the identifier hash should be interpreted.identifierHash
- is the hashed document identifier.componentHash
- is the hashed component identifier, if any.documentVersion
- is the document version.authorityName
- is the name of the authority associated with the document, if any.recordTime
- is the time at which the recording took place, in milliseconds since epoch.activities
- is an object providing a set of methods that the implementer can use to perform the operation.ManifoldCFException
ServiceInterruption
boolean documentIngest(IPipelineSpecificationWithVersions pipelineSpecificationWithVersions, java.lang.String identifierClass, java.lang.String identifierHash, java.lang.String componentHash, java.lang.String documentVersion, java.lang.String authorityName, RepositoryDocument data, long ingestTime, java.lang.String documentURI, IOutputActivity activities) throws ManifoldCFException, ServiceInterruption, java.io.IOException
pipelineSpecificationWithVersions
- is the pipeline specification with already-fetched output versioning information.identifierClass
- is the name of the space in which the identifier hash should be interpreted.identifierHash
- is the hashed document identifier.componentHash
- is the hashed component identifier, if any.documentVersion
- is the document version.authorityName
- is the name of the authority associated with the document, if any.data
- is the document data. The data is closed after ingestion is complete.ingestTime
- is the time at which the ingestion took place, in milliseconds since epoch.documentURI
- is the URI of the document, which will be used as the key of the document in the index.activities
- is an object providing a set of methods that the implementer can use to perform the operation.java.io.IOException
- only if data stream throws an IOException.ManifoldCFException
ServiceInterruption
void documentRemove(IPipelineConnections pipelineConnections, java.lang.String identifierClass, java.lang.String identifierHash, java.lang.String componentHash, IOutputRemoveActivity activities) throws ManifoldCFException, ServiceInterruption
pipelineConnections
- is the pipeline specification.identifierClass
- is the name of the space in which the identifier hash should be interpreted.identifierHash
- is the hash of the id of the document.componentHash
- is the hashed component identifier, if any.activities
- is the object to use to log the details of the ingestion attempt. May be null.ManifoldCFException
ServiceInterruption
void documentRemoveMultiple(IPipelineConnections pipelineConnections, java.lang.String[] identifierClasses, java.lang.String[] identifierHashes, java.lang.String componentHash, IOutputRemoveActivity activities) throws ManifoldCFException, ServiceInterruption
pipelineConnections
- is the pipeline specification.identifierClasses
- are the names of the spaces in which the identifier hash should be interpreted.identifierHashes
- are the hashes of the ids of the documents.componentHash
- is the hashed component identifier, if any.activities
- is the object to use to log the details of the ingestion attempt. May be null.ManifoldCFException
ServiceInterruption
void documentCheckMultiple(IPipelineSpecificationBasic pipelineSpecificationBasic, java.lang.String[] identifierClasses, java.lang.String[] identifierHashes, long checkTime) throws ManifoldCFException
pipelineSpecificationBasic
- is a pipeline specification.identifierClasses
- are the names of the spaces in which the identifier hashes should be interpreted.identifierHashes
- are the set of document identifier hashes.checkTime
- is the time at which the check took place, in milliseconds since epoch.ManifoldCFException
void documentCheck(IPipelineSpecificationBasic pipelineSpecificationBasic, java.lang.String identifierClass, java.lang.String identifierHash, long checkTime) throws ManifoldCFException
pipelineSpecificationBasic
- is a basic pipeline specification.identifierClass
- is the name of the space in which the identifier hash should be interpreted.identifierHash
- is the hashed document identifier.checkTime
- is the time at which the check took place, in milliseconds since epoch.ManifoldCFException
void documentDeleteMultiple(IPipelineConnections[] pipelineConnections, java.lang.String[] identifierClasses, java.lang.String[] identifierHashes, IOutputRemoveActivity activities) throws ManifoldCFException, ServiceInterruption
pipelineConnections
- are the pipeline specifications associated with the documents.identifierClasses
- are the names of the spaces in which the identifier hashes should be interpreted.identifierHashes
- is the array of document identifier hashes of the documents.activities
- is the object to use to log the details of the ingestion attempt. May be null.ManifoldCFException
ServiceInterruption
void documentDeleteMultiple(IPipelineConnections pipelineConnections, java.lang.String[] identifierClasses, java.lang.String[] identifierHashes, IOutputRemoveActivity activities) throws ManifoldCFException, ServiceInterruption
pipelineConnections
- is the pipeline specification.identifierClasses
- are the names of the spaces in which the identifier hashes should be interpreted.identifierHashes
- is the array of document identifier hashes of the documents.activities
- is the object to use to log the details of the ingestion attempt. May be null.ManifoldCFException
ServiceInterruption
void documentDelete(IPipelineConnections pipelineConnections, java.lang.String identifierClass, java.lang.String identifierHash, IOutputRemoveActivity activities) throws ManifoldCFException, ServiceInterruption
pipelineConnections
- is the pipeline specification.identifierClass
- is the name of the space in which the identifier hash should be interpreted.identifierHash
- is the hash of the id of the document.activities
- is the object to use to log the details of the ingestion attempt. May be null.ManifoldCFException
ServiceInterruption
void getPipelineDocumentIngestDataMultiple(IngestStatuses rval, IPipelineSpecificationBasic[] pipelineSpecificationBasics, java.lang.String[] identifierClasses, java.lang.String[] identifierHashes) throws ManifoldCFException
rval
- is a map of output key to document data, in no particular order, which will be loaded with all matching results.pipelineSpecificationBasics
- are the pipeline specifications corresponding to the identifier classes and hashes.identifierClasses
- are the names of the spaces in which the identifier hashes should be interpreted.identifierHashes
- is the array of document identifier hashes to look up.ManifoldCFException
void getPipelineDocumentIngestDataMultiple(IngestStatuses rval, IPipelineSpecificationBasic pipelineSpecificationBasic, java.lang.String[] identifierClasses, java.lang.String[] identifierHashes) throws ManifoldCFException
rval
- is a map of output key to document data, in no particular order, which will be loaded with all matching results.pipelineSpecificationBasic
- is the pipeline specification for all documents.identifierClasses
- are the names of the spaces in which the identifier hashes should be interpreted.identifierHashes
- is the array of document identifier hashes to look up.ManifoldCFException
void getPipelineDocumentIngestData(IngestStatuses rval, IPipelineSpecificationBasic pipelineSpecificationBasic, java.lang.String identifierClass, java.lang.String identifierHash) throws ManifoldCFException
rval
- is a map of output key to document data, in no particular order, which will be loaded with all matching results.pipelineSpecificationBasic
- is the pipeline specification for the document.identifierClass
- is the name of the space in which the identifier hash should be interpreted.identifierHash
- is the hash of the id of the document.ManifoldCFException
long[] getDocumentUpdateIntervalMultiple(IPipelineSpecificationBasic pipelineSpecificationBasic, java.lang.String[] identifierClasses, java.lang.String[] identifierHashes) throws ManifoldCFException
pipelineSpecificationBasic
- is the basic pipeline specification.identifierClasses
- are the names of the spaces in which the identifier hashes should be interpreted.identifierHashes
- are the hashes of the ids of the documents.ManifoldCFException
long getDocumentUpdateInterval(IPipelineSpecificationBasic pipelineSpecificationBasic, java.lang.String identifierClass, java.lang.String identifierHash) throws ManifoldCFException
pipelineSpecificationBasic
- is the basic pipeline specification.identifierClass
- is the name of the space in which the identifier hash should be interpreted.identifierHash
- is the hash of the id of the document.ManifoldCFException
void resetOutputConnection(IOutputConnection outputConnection) throws ManifoldCFException
outputConnection
- is the output connection associated with this action.ManifoldCFException
void removeOutputConnection(IOutputConnection outputConnection) throws ManifoldCFException
outputConnection
- is the output connection associated with this action.ManifoldCFException