Class WorkerThread.ProcessActivity

    • Field Detail

      • jobID

        protected final java.lang.Long jobID
      • processID

        protected final java.lang.String processID
      • connectionName

        protected final java.lang.String connectionName
      • previousDocuments

        protected final java.util.Map<java.lang.String,​QueuedDocument> previousDocuments
      • currentTime

        protected final long currentTime
      • expireInterval

        protected final java.lang.Long expireInterval
      • recrawlInterval

        protected final java.lang.Long recrawlInterval
      • maxInterval

        protected final java.lang.Long maxInterval
      • hopcountMode

        protected final int hopcountMode
      • legalLinkTypes

        protected final java.lang.String[] legalLinkTypes
      • lowerRescheduleBounds

        protected final java.util.Map<java.lang.String,​java.lang.Long> lowerRescheduleBounds
      • upperRescheduleBounds

        protected final java.util.Map<java.lang.String,​java.lang.Long> upperRescheduleBounds
      • lowerExpireBounds

        protected final java.util.Map<java.lang.String,​java.lang.Long> lowerExpireBounds
      • upperExpireBounds

        protected final java.util.Map<java.lang.String,​java.lang.Long> upperExpireBounds
      • originationTimes

        protected final java.util.Map<java.lang.String,​java.lang.Long> originationTimes
      • abortSet

        protected final java.util.Set<java.lang.String> abortSet
      • touchedSet

        protected final java.util.Set<java.lang.String> touchedSet
      • documentDeletedSet

        protected final java.util.Set<java.lang.String> documentDeletedSet
      • allComponentsSet

        protected final java.util.Set<java.lang.String> allComponentsSet
      • touchedComponentSet

        protected final java.util.Map<java.lang.String,​java.util.Set<java.lang.String>> touchedComponentSet
      • touchedPrimarySet

        protected final java.util.Set<java.lang.String> touchedPrimarySet
    • Method Detail

      • wasDocumentTouched

        public boolean wasDocumentTouched​(java.lang.String documentIdentifier)
        Check whether a document (and its version string) was touched or not.
      • wasDocumentComponentTouched

        public boolean wasDocumentComponentTouched​(java.lang.String documentIdentifier,
                                                   java.lang.String componentIdentifierHash)
        Check whether a document component was touched or not.
      • wasDocumentDeleted

        public boolean wasDocumentDeleted​(java.lang.String documentIdentifier)
        Check whether a document was deleted or not.
      • wasDocumentAborted

        public boolean wasDocumentAborted​(java.lang.String documentIdentifier)
        Check whether a document was aborted or not.
      • checkDocumentNeedsReindexing

        public boolean checkDocumentNeedsReindexing​(java.lang.String documentIdentifier,
                                                    java.lang.String newVersionString)
                                             throws ManifoldCFException
        Check if a document needs to be reindexed, based on a computed version string. Call this method to determine whether reindexing is necessary. Pass in a newly-computed version string. This method will return "true" if the document needs to be re-indexed.
        Specified by:
        checkDocumentNeedsReindexing in interface IProcessActivity
        Parameters:
        documentIdentifier - is the document identifier.
        newVersionString - is the newly-computed version string.
        Returns:
        true if the document needs to be reindexed.
        Throws:
        ManifoldCFException
      • checkDocumentNeedsReindexing

        public boolean checkDocumentNeedsReindexing​(java.lang.String documentIdentifier,
                                                    java.lang.String componentIdentifier,
                                                    java.lang.String newVersionString)
                                             throws ManifoldCFException
        Check if a document needs to be reindexed, based on a computed version string. Call this method to determine whether reindexing is necessary. Pass in a newly-computed version string. This method will return "true" if the document needs to be re-indexed.
        Specified by:
        checkDocumentNeedsReindexing in interface IProcessActivity
        Parameters:
        documentIdentifier - is the document identifier.
        componentIdentifier - is the component document identifier, if any.
        newVersionString - is the newly-computed version string.
        Returns:
        true if the document needs to be reindexed.
        Throws:
        ManifoldCFException
      • addDocumentReference

        public void addDocumentReference​(java.lang.String localIdentifier,
                                         java.lang.String parentIdentifier,
                                         java.lang.String relationshipType,
                                         java.lang.String[] dataNames,
                                         java.lang.Object[][] dataValues,
                                         java.lang.Long originationTime,
                                         java.lang.String[] prereqEventNames)
                                  throws ManifoldCFException
        Add a document description to the current job's queue.
        Specified by:
        addDocumentReference in interface IProcessActivity
        Parameters:
        localIdentifier - is the local document identifier to add (for the connector that fetched the document).
        parentIdentifier - is the document identifier that is considered to be the "parent" of this identifier. May be null if no hopcount filtering is desired for this kind of relationship.
        relationshipType - is the string describing the kind of relationship described by this reference. This must be one of the strings returned by the IRepositoryConnector method "getRelationshipTypes()". May be null.
        dataNames - is the list of carry-down data from the parent to the child. May be null. Each name is limited to 255 characters!
        dataValues - are the values that correspond to the data names in the dataNames parameter. May be null only if dataNames is null. The type of each object must be either a String or a CharacterInput.
        originationTime - is the time, in ms since epoch, that the document originated. Pass null if none or unknown.
        prereqEventNames - are the names of the prerequisite events which this document requires prior to processing. Pass null if none.
        Throws:
        ManifoldCFException
      • addDocumentReference

        public void addDocumentReference​(java.lang.String localIdentifier,
                                         java.lang.String parentIdentifier,
                                         java.lang.String relationshipType,
                                         java.lang.String[] dataNames,
                                         java.lang.Object[][] dataValues,
                                         java.lang.Long originationTime)
                                  throws ManifoldCFException
        Add a document description to the current job's queue.
        Specified by:
        addDocumentReference in interface IProcessActivity
        Parameters:
        localIdentifier - is the local document identifier to add (for the connector that fetched the document).
        parentIdentifier - is the document identifier that is considered to be the "parent" of this identifier. May be null if no hopcount filtering is desired for this kind of relationship.
        relationshipType - is the string describing the kind of relationship described by this reference. This must be one of the strings returned by the IRepositoryConnector method "getRelationshipTypes()". May be null.
        dataNames - is the list of carry-down data from the parent to the child. May be null. Each name is limited to 255 characters!
        dataValues - are the values that correspond to the data names in the dataNames parameter. May be null only if dataNames is null.
        originationTime - is the time, in ms since epoch, that the document originated. Pass null if none or unknown.
        Throws:
        ManifoldCFException
      • addDocumentReference

        public void addDocumentReference​(java.lang.String localIdentifier,
                                         java.lang.String parentIdentifier,
                                         java.lang.String relationshipType,
                                         java.lang.String[] dataNames,
                                         java.lang.Object[][] dataValues)
                                  throws ManifoldCFException
        Add a document description to the current job's queue.
        Specified by:
        addDocumentReference in interface IProcessActivity
        Parameters:
        localIdentifier - is the local document identifier to add (for the connector that fetched the document).
        parentIdentifier - is the document identifier that is considered to be the "parent" of this identifier. May be null if no hopcount filtering is desired for this kind of relationship.
        relationshipType - is the string describing the kind of relationship described by this reference. This must be one of the strings returned by the IRepositoryConnector method "getRelationshipTypes()". May be null.
        dataNames - is the list of carry-down data from the parent to the child. May be null. Each name is limited to 255 characters!
        dataValues - are the values that correspond to the data names in the dataNames parameter. May be null only if dataNames is null.
        Throws:
        ManifoldCFException
      • addDocumentReference

        public void addDocumentReference​(java.lang.String localIdentifier,
                                         java.lang.String parentIdentifier,
                                         java.lang.String relationshipType)
                                  throws ManifoldCFException
        Add a document description to the current job's queue.
        Specified by:
        addDocumentReference in interface IProcessActivity
        Parameters:
        localIdentifier - is the local document identifier to add (for the connector that fetched the document).
        parentIdentifier - is the document identifier that is considered to be the "parent" of this identifier. May be null if no hopcount filtering is desired for this kind of relationship.
        relationshipType - is the string describing the kind of relationship described by this reference. This must be one of the strings returned by the IRepositoryConnector method "getRelationshipTypes()". May be null.
        Throws:
        ManifoldCFException
      • addDocumentReference

        public void addDocumentReference​(java.lang.String localIdentifier)
                                  throws ManifoldCFException
        Add a document description to the current job's queue. This method is equivalent to addDocumentReference(localIdentifier,null,null).
        Specified by:
        addDocumentReference in interface IProcessActivity
        Parameters:
        localIdentifier - is the local document identifier to add (for the connector that fetched the document).
        Throws:
        ManifoldCFException
      • retrieveParentData

        public java.lang.String[] retrieveParentData​(java.lang.String localIdentifier,
                                                     java.lang.String dataName)
                                              throws ManifoldCFException
        Retrieve data passed from parents to a specified child document.
        Specified by:
        retrieveParentData in interface ICarrydownActivity
        Parameters:
        localIdentifier - is the document identifier of the document we want the recorded data for.
        dataName - is the name of the data items to retrieve.
        Returns:
        an array containing the unique data values passed from ALL parents. Note that these are in no particular order, and there will not be any duplicates.
        Throws:
        ManifoldCFException
      • retrieveParentDataAsFiles

        public CharacterInput[] retrieveParentDataAsFiles​(java.lang.String localIdentifier,
                                                          java.lang.String dataName)
                                                   throws ManifoldCFException
        Retrieve data passed from parents to a specified child document.
        Specified by:
        retrieveParentDataAsFiles in interface ICarrydownActivity
        Parameters:
        localIdentifier - is the document identifier of the document we want the recorded data for.
        dataName - is the name of the data items to retrieve.
        Returns:
        an array containing the unique data values passed from ALL parents. Note that these are in no particular order, and there will not be any duplicates.
        Throws:
        ManifoldCFException
      • recordDocument

        public void recordDocument​(java.lang.String documentIdentifier,
                                   java.lang.String version)
                            throws ManifoldCFException
        Record a document version, but don't ingest it.
        Specified by:
        recordDocument in interface IProcessActivity
        Parameters:
        documentIdentifier - is the document identifier.
        version - is the document version.
        Throws:
        ManifoldCFException
      • recordDocument

        public void recordDocument​(java.lang.String documentIdentifier,
                                   java.lang.String componentIdentifier,
                                   java.lang.String version)
                            throws ManifoldCFException
        Record a document version, WITHOUT reindexing it, or removing it. (Other documents with the same URL, however, will still be removed.) This is useful if the version string changes but the document contents are known not to have changed.
        Specified by:
        recordDocument in interface IProcessActivity
        Parameters:
        documentIdentifier - is the document identifier.
        componentIdentifier - is the component document identifier, if any.
        version - is the document version.
        Throws:
        ManifoldCFException
      • ingestDocumentWithException

        public void ingestDocumentWithException​(java.lang.String documentIdentifier,
                                                java.lang.String version,
                                                java.lang.String documentURI,
                                                RepositoryDocument data)
                                         throws ManifoldCFException,
                                                ServiceInterruption,
                                                java.io.IOException
        Ingest the current document.
        Specified by:
        ingestDocumentWithException in interface IProcessActivity
        Parameters:
        documentIdentifier - is the document's local identifier.
        version - is the version of the document, as reported by the getDocumentVersions() method of the corresponding repository connector.
        documentURI - is the URI to use to retrieve this document from the search interface (and is also the unique key in the index).
        data - is the document data. The data is closed after ingestion is complete.
        Throws:
        java.io.IOException - only when data stream reading fails.
        ManifoldCFException
        ServiceInterruption
      • ingestDocumentWithException

        public void ingestDocumentWithException​(java.lang.String documentIdentifier,
                                                java.lang.String componentIdentifier,
                                                java.lang.String version,
                                                java.lang.String documentURI,
                                                RepositoryDocument data)
                                         throws ManifoldCFException,
                                                ServiceInterruption,
                                                java.io.IOException
        Ingest the current document.
        Specified by:
        ingestDocumentWithException in interface IProcessActivity
        Parameters:
        documentIdentifier - is the document's identifier.
        componentIdentifier - is the component document identifier, if any.
        version - is the version of the document, as reported by the getDocumentVersions() method of the corresponding repository connector.
        documentURI - is the URI to use to retrieve this document from the search interface (and is also the unique key in the index).
        data - is the document data. The data is closed after ingestion is complete.
        Throws:
        java.io.IOException - only when data stream reading fails.
        ManifoldCFException
        ServiceInterruption
      • noDocument

        public void noDocument​(java.lang.String documentIdentifier,
                               java.lang.String version)
                        throws ManifoldCFException,
                               ServiceInterruption
        Remove the specified document from the search engine index, while keeping track of the version information for it (to reduce churn).
        Specified by:
        noDocument in interface IProcessActivity
        Parameters:
        documentIdentifier - is the document's local identifier.
        version - is the version string to be recorded for the document.
        Throws:
        ManifoldCFException
        ServiceInterruption
      • noDocument

        public void noDocument​(java.lang.String documentIdentifier,
                               java.lang.String componentIdentifier,
                               java.lang.String version)
                        throws ManifoldCFException,
                               ServiceInterruption
        Remove the specified document from the search engine index, and update the recorded version information for the document.
        Specified by:
        noDocument in interface IProcessActivity
        Parameters:
        documentIdentifier - is the document's local identifier.
        componentIdentifier - is the component document identifier, if any.
        version - is the version string to be recorded for the document.
        Throws:
        ManifoldCFException
        ServiceInterruption
      • removeDocument

        public void removeDocument​(java.lang.String documentIdentifier)
                            throws ManifoldCFException,
                                   ServiceInterruption
        Remove the specified document's primary component permanently from the search engine index, and from the status table. Use this method when your document has components and previously also had a primary document, but will not have a primary document again for the foreseeable future. This is a rare situation.
        Specified by:
        removeDocument in interface IProcessActivity
        Parameters:
        documentIdentifier - is the document's identifier.
        Throws:
        ManifoldCFException
        ServiceInterruption
      • retainDocument

        public void retainDocument​(java.lang.String documentIdentifier,
                                   java.lang.String componentIdentifier)
                            throws ManifoldCFException
        Retain existing document component. Use this method to signal that an already-existing document component does not need to be reindexed. The default behavior is to remove components that are not mentioned during processing.
        Specified by:
        retainDocument in interface IProcessActivity
        Parameters:
        documentIdentifier - is the document's identifier.
        componentIdentifier - is the component document identifier, which cannot be null.
        Throws:
        ManifoldCFException
      • retainAllComponentDocument

        public void retainAllComponentDocument​(java.lang.String documentIdentifier)
                                        throws ManifoldCFException
        Retain all existing document components of a primary document. Use this method to signal that no document components need to be reindexed. The default behavior is to remove components that are not mentioned during processing.
        Specified by:
        retainAllComponentDocument in interface IProcessActivity
        Parameters:
        documentIdentifier - is the document's identifier.
        Throws:
        ManifoldCFException
      • deleteDocument

        public void deleteDocument​(java.lang.String documentIdentifier)
                            throws ManifoldCFException
        Delete the specified document from the search engine index, and from the status table. This method does NOT keep track of version information for the document and thus can lead to "churn", whereby the same document is queued, processed, and removed on subsequent crawls. It is therefore preferable to use noDocument() instead, in any case where the same decision will need to be made over and over.
        Specified by:
        deleteDocument in interface IProcessActivity
        Parameters:
        documentIdentifier - is the document's identifier.
        Throws:
        ManifoldCFException
      • setDocumentScheduleBounds

        public void setDocumentScheduleBounds​(java.lang.String localIdentifier,
                                              java.lang.Long lowerRecrawlBoundTime,
                                              java.lang.Long upperRecrawlBoundTime,
                                              java.lang.Long lowerExpireBoundTime,
                                              java.lang.Long upperExpireBoundTime)
                                       throws ManifoldCFException
        Override the schedule for the next time a document is crawled. Calling this method allows you to set an upper recrawl bound, lower recrawl bound, upper expire bound, lower expire bound, or a combination of these, on a specific document. This method is only effective if the job is a continuous one, and if the identifier you pass in is being processed.
        Specified by:
        setDocumentScheduleBounds in interface IProcessActivity
        Parameters:
        localIdentifier - is the document's local identifier.
        lowerRecrawlBoundTime - is the time in ms since epoch that the reschedule time should not fall BELOW, or null if none.
        upperRecrawlBoundTime - is the time in ms since epoch that the reschedule time should not rise ABOVE, or null if none.
        lowerExpireBoundTime - is the time in ms since epoch that the expire time should not fall BELOW, or null if none.
        upperExpireBoundTime - is the time in ms since epoch that the expire time should not rise ABOVE, or null if none.
        Throws:
        ManifoldCFException
      • setDocumentOriginationTime

        public void setDocumentOriginationTime​(java.lang.String localIdentifier,
                                               java.lang.Long originationTime)
                                        throws ManifoldCFException
        Override a document's origination time. Use this method to signal the framework that a document's origination time is something other than the first time it was crawled.
        Specified by:
        setDocumentOriginationTime in interface IProcessActivity
        Parameters:
        localIdentifier - is the document's local identifier.
        originationTime - is the document's origination time, or null if unknown.
        Throws:
        ManifoldCFException
      • getDocumentRescheduleLowerBoundTime

        public java.lang.Long getDocumentRescheduleLowerBoundTime​(java.lang.String localIdentifier)
        Find a document's lower rescheduling time bound, if any.
      • getDocumentRescheduleUpperBoundTime

        public java.lang.Long getDocumentRescheduleUpperBoundTime​(java.lang.String localIdentifier)
        Find a document's upper rescheduling time bound, if any.
      • getDocumentExpirationLowerBoundTime

        public java.lang.Long getDocumentExpirationLowerBoundTime​(java.lang.String localIdentifier)
        Find a document's lower expiration time bound, if any.
      • getDocumentExpirationUpperBoundTime

        public java.lang.Long getDocumentExpirationUpperBoundTime​(java.lang.String localIdentifier)
        Find a document's upper expiration time bound, if any.
      • getDocumentOriginationTime

        public java.lang.Long getDocumentOriginationTime​(java.lang.String localIdentifier)
        Get a document's origination time.
      • calculateDocumentRescheduleTime

        public java.lang.Long calculateDocumentRescheduleTime​(long currentTime,
                                                              long timeAmt,
                                                              java.lang.String localIdentifier)
      • calculateDocumentExpireTime

        public java.lang.Long calculateDocumentExpireTime​(long currentTime,
                                                          java.lang.String localIdentifier)
      • resetTimes

        public void resetTimes()
        Reset the recorded times.
      • recordActivity

        public void recordActivity​(java.lang.Long startTime,
                                   java.lang.String activityType,
                                   java.lang.Long dataSize,
                                   java.lang.String entityIdentifier,
                                   java.lang.String resultCode,
                                   java.lang.String resultDescription,
                                   java.lang.String[] childIdentifiers)
                            throws ManifoldCFException
        Record time-stamped information about the activity of the connector.
        Specified by:
        recordActivity in interface IHistoryActivity
        Parameters:
        startTime - is either null or the time, in milliseconds since the start of the epoch (Jan 1, 1970), at which the activity began. Every activity has an associated time; the startTime field records when the activity began. A null value indicates that the start time and the finishing time are the same.
        activityType - is a string which is fully interpretable only in the context of the connector involved, which is used to categorize what kind of activity is being recorded. For example, a web connector might record a "fetch document" activity. Cannot be null.
        dataSize - is the number of bytes of data involved in the activity, or null if not applicable.
        entityIdentifier - is a (possibly long) string which identifies the object involved in the history record. The interpretation of this field will differ from connector to connector. May be null.
        resultCode - contains a terse description of the result of the activity. The description is limited in size to 255 characters, and can be interpreted only in the context of the current connector. May be null.
        resultDescription - is a (possibly long) human-readable string which adds detail, if required, to the result described in the resultCode field. This field is not meant to be queried on. May be null.
        childIdentifiers - is a set of child entity identifiers associated with this activity. May be null.
        Throws:
        ManifoldCFException
      • checkJobStillActive

        public void checkJobStillActive()
                                 throws ManifoldCFException,
                                        ServiceInterruption
        Check whether current job is still active. This method is provided to allow an individual connector that needs to wait on some long-term condition to give up waiting due to the job itself being aborted. If the connector should abort, this method will raise a properly-formed ServiceInterruption, which if thrown to the caller, will signal that the current processing activity remains incomplete and must be retried when the job is resumed.
        Specified by:
        checkJobStillActive in interface IAbortActivity
        Throws:
        ManifoldCFException
        ServiceInterruption
      • beginEventSequence

        public boolean beginEventSequence​(java.lang.String eventName)
                                   throws ManifoldCFException
        Begin an event sequence. This method should be called by a connector when a sequencing event should enter the "pending" state. If the event is already in that state, this method will return false, otherwise true. The connector has the responsibility of appropriately managing sequencing given the response status.
        Specified by:
        beginEventSequence in interface IEventActivity
        Parameters:
        eventName - is the event name.
        Returns:
        false if the event is already in the "pending" state.
        Throws:
        ManifoldCFException
      • completeEventSequence

        public void completeEventSequence​(java.lang.String eventName)
                                   throws ManifoldCFException
        Complete an event sequence. This method should be called to signal that an event is no longer in the "pending" state. This can mean that the prerequisite processing is completed, but it can also mean that prerequisite processing was aborted or cannot be completed. Note well: This method should not be called unless the connector is CERTAIN that an event is in progress, and that the current thread has the sole right to complete it. Otherwise, race conditions can develop which would be difficult to diagnose.
        Specified by:
        completeEventSequence in interface IEventActivity
        Parameters:
        eventName - is the event name.
        Throws:
        ManifoldCFException
      • retryDocumentProcessing

        public void retryDocumentProcessing​(java.lang.String localIdentifier)
                                     throws ManifoldCFException
        Abort processing a document (for sequencing reasons). This method should be called in order to cause the specified document to be requeued for later processing. While this is similar in some respects to the semantics of a ServiceInterruption, it is applicable to only one document at a time, and also does not specify any delay period, since it is presumed that the requeue is due to sequencing issues synchronized around an underlying event.
        Specified by:
        retryDocumentProcessing in interface IEventActivity
        Parameters:
        localIdentifier - is the document identifier to requeue.
        Throws:
        ManifoldCFException
      • checkDateIndexable

        public boolean checkDateIndexable​(java.util.Date date)
                                   throws ManifoldCFException,
                                          ServiceInterruption
        Detect if a date is indexable or not. This method is used by participating repository connectors to pre-filter the number of unusable documents that will be passed to this output connector.
        Specified by:
        checkDateIndexable in interface IFingerprintActivity
        Parameters:
        date - is the date of the document; may be null
        Returns:
        true if a document with that date is indexable by this connector.
        Throws:
        ManifoldCFException
        ServiceInterruption
      • createGlobalString

        public java.lang.String createGlobalString​(java.lang.String simpleString)
        Create a global string from a simple string.
        Specified by:
        createGlobalString in interface INamingActivity
        Parameters:
        simpleString - is the simple string.
        Returns:
        a global string.
      • createConnectionSpecificString

        public java.lang.String createConnectionSpecificString​(java.lang.String simpleString)
        Create a connection-specific string from a simple string.
        Specified by:
        createConnectionSpecificString in interface INamingActivity
        Parameters:
        simpleString - is the simple string.
        Returns:
        a connection-specific string.
      • createJobSpecificString

        public java.lang.String createJobSpecificString​(java.lang.String simpleString)
        Create a job-specific string from a simple string.
        Specified by:
        createJobSpecificString in interface INamingActivity
        Parameters:
        simpleString - is the simple string.
        Returns:
        a job-specific string.
      • checkAllComponentsMultipleDispositions

        protected void checkAllComponentsMultipleDispositions​(java.lang.String documentIdentifier)
      • checkMultipleDispositions

        protected void checkMultipleDispositions​(java.lang.String documentIdentifier,
                                                 java.lang.String componentIdentifier,
                                                 java.lang.String componentIdentifierHash)
      • touchAllComponentsSet

        protected void touchAllComponentsSet​(java.lang.String documentIdentifier)
      • touchComponentSet

        protected void touchComponentSet​(java.lang.String documentIdentifier,
                                         java.lang.String componentIdentifierHash)
      • computePipelineSpecificationWithVersions

        protected IPipelineSpecificationWithVersions computePipelineSpecificationWithVersions​(java.lang.String documentIdentifierHash,
                                                                                              java.lang.String componentIdentifierHash,
                                                                                              java.lang.String documentIdentifier)