/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
using Document = Lucene.Net.Documents.Document;
using Fieldable = Lucene.Net.Documents.Fieldable;
using AlreadyClosedException = Lucene.Net.Store.AlreadyClosedException;
using Directory = Lucene.Net.Store.Directory;
using IndexInput = Lucene.Net.Store.IndexInput;
using IndexOutput = Lucene.Net.Store.IndexOutput;
using RAMOutputStream = Lucene.Net.Store.RAMOutputStream;
using Analyzer = Lucene.Net.Analysis.Analyzer;
using Token = Lucene.Net.Analysis.Token;
using TokenStream = Lucene.Net.Analysis.TokenStream;
using Similarity = Lucene.Net.Search.Similarity;
namespace Lucene.Net.Index
{
/// <summary> This class accepts multiple added documents and directly
/// writes a single segment file. It does this more
/// efficiently than creating a single segment per document
/// (with DocumentWriter) and doing standard merges on those
/// segments.
///
/// When a document is added, its stored fields (if any) and
/// term vectors (if any) are immediately written to the
/// Directory (ie these do not consume RAM). The freq/prox
/// postings are accumulated into a Postings hash table keyed
/// by term. Each entry in this hash table holds a separate
/// byte stream (allocated as incrementally growing slices
/// into large shared byte[] arrays) for freq and prox, that
/// contains the postings data for multiple documents. If
/// vectors are enabled, each unique term for each document
/// also allocates a PostingVector instance to similarly
/// track the offsets & positions byte stream.
///
/// Once the Postings hash is full (ie is consuming the
/// allowed RAM) or the number of added docs is large enough
/// (when we are flushing by doc count instead of RAM
/// usage), we create a real segment and flush it to disk and
/// reset the Postings hash.
///
/// In adding a document we first organize all of its fields
/// by field name. We then process field by field, and
/// record the Posting hash per-field. After each field we
/// flush its term vectors. When it's time to flush the full
/// segment we first sort the fields by name, and then go
/// field by field and sort its postings.
///
///
/// Threads:
///
/// Multiple threads are allowed into addDocument at once.
/// There is an initial synchronized call to getThreadState
/// which allocates a ThreadState for this thread. The same
/// thread will get the same ThreadState over time (thread
/// affinity) so that if there are consistent patterns (for
/// example each thread is indexing a different content
/// source) then we make better use of RAM. Then
/// processDocument is called on that ThreadState without
/// synchronization (most of the "heavy lifting" is in this
/// call). Finally the synchronized "finishDocument" is
/// called to flush changes to the directory.
///
/// Each ThreadState instance has its own Posting hash. Once
/// we're using too much RAM, we flush all Posting hashes to
/// a segment by merging the docIDs in the posting lists for
/// the same term across multiple thread states (see
/// writeSegment and appendPostings).
///
/// When flush is called by IndexWriter, or when we flush
/// internally (autoCommit=false), we forcefully idle all
/// threads and flush only once they are all idle. This
/// means you can call flush with a given thread even while
/// other threads are actively adding/deleting documents.
///
///
/// Exceptions:
///
/// Because this class directly updates in-memory posting
/// lists, and flushes stored fields and term vectors
/// directly to files in the directory, there are certain
/// limited times when an exception can corrupt this state.
/// For example, a disk-full error while flushing stored fields
/// leaves this file in a corrupt state. Or, an OOM
/// exception while appending to the in-memory posting lists
/// can corrupt that posting list. We call such exceptions
/// "aborting exceptions". In these cases we must call
/// abort() to discard all docs added since the last flush.
///
/// All other exceptions ("non-aborting exceptions") can
/// still partially update the index structures. These
/// updates are consistent, but they represent only the part
/// of the document seen up to the point where the exception was hit.
/// When this happens, we immediately mark the document as
/// deleted so that the document is always atomically ("all
/// or none") added to the index.
/// </summary>
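/// <remarks>
/// A rough sketch of the per-document flow described above, using the
/// method names this summary refers to (signatures are paraphrased, not
/// exact):
/// <code>
/// ThreadState state = GetThreadState(doc, delTerm); // synchronized; gives thread affinity
/// try
/// {
///     state.Init(doc, nextDocID);                   // synchronized bookkeeping
///     state.ProcessDocument(analyzer);              // un-synchronized "heavy lifting"
/// }
/// finally
/// {
///     FinishDocument(state);                        // synchronized; flushes to the Directory
/// }
/// </code>
/// </remarks>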
public sealed class DocumentsWriter
{
private void InitBlock()
{
threadStates = new ThreadState[0];
waitingThreadStates = new ThreadState[MAX_THREAD_STATE];
maxBufferedDeleteTerms = IndexWriter.DEFAULT_MAX_BUFFERED_DELETE_TERMS;
ramBufferSize = (long) (IndexWriter.DEFAULT_RAM_BUFFER_SIZE_MB * 1024 * 1024); // {{Aroush-2.3.1}} should 'ramBufferSize'
maxBufferedDocs = IndexWriter.DEFAULT_MAX_BUFFERED_DOCS;
norms = new BufferedNorms[0];
}
private IndexWriter writer;
private Directory directory;
private FieldInfos fieldInfos = new FieldInfos(); // All fields we've seen
private IndexOutput tvx, tvf, tvd; // To write term vectors
private FieldsWriter fieldsWriter; // To write stored fields
private System.String segment; // Current segment we are working on
private System.String docStoreSegment; // Current doc-store segment we are writing
private int docStoreOffset; // Current starting doc-store offset of current segment
private int nextDocID; // Next docID to be added
private int numDocsInRAM; // # docs buffered in RAM
private int numDocsInStore; // # docs written to doc stores
private int nextWriteDocID; // Next docID to be written
// Max # ThreadState instances; if there are more threads
// than this they share ThreadStates
private const int MAX_THREAD_STATE = 5;
private ThreadState[] threadStates;
private System.Collections.Hashtable threadBindings = new System.Collections.Hashtable();
private int numWaiting;
private ThreadState[] waitingThreadStates;
private int pauseThreads; // Non-zero when we need all threads to
// pause (eg to flush)
private bool flushPending; // True when a thread has decided to flush
private bool bufferIsFull; // True when it's time to write segment
private int abortCount; // Non-zero while abort is pending or running
private System.IO.TextWriter infoStream;
// This Hashtable buffers delete terms in RAM before they
// are applied. The key is the delete term; the value is the
// number of buffered documents the term applies to.
private System.Collections.Hashtable bufferedDeleteTerms = new System.Collections.Hashtable();
private int numBufferedDeleteTerms = 0;
// Currently used only for deleting a doc on hitting an non-aborting exception
private System.Collections.IList bufferedDeleteDocIDs = new System.Collections.ArrayList();
// The max number of delete terms that can be buffered before
// they must be flushed to disk.
private int maxBufferedDeleteTerms;
// How much RAM we can use before flushing. This is 0 if
// we are flushing by doc count instead.
private long ramBufferSize;
// Flush @ this number of docs. If ramBufferSize is
// non-zero we will flush by RAM usage instead.
private int maxBufferedDocs;
private bool closed;
// Coarse estimates used to measure RAM usage of buffered deletes
private static int OBJECT_HEADER_BYTES = 8;
private static int OBJECT_POINTER_BYTES = 4; // TODO: should be 8 on 64-bit platform
private static int BYTES_PER_CHAR = 2;
private static int BYTES_PER_INT = 4;
private BufferedNorms[] norms; // Holds norms until we flush
internal DocumentsWriter(Directory directory, IndexWriter writer)
{
InitBlock();
this.directory = directory;
this.writer = writer;
postingsFreeList = new Posting[0];
}
/// <summary>If non-null, various details of indexing are printed
/// here.
/// </summary>
internal void SetInfoStream(System.IO.TextWriter infoStream)
{
this.infoStream = infoStream;
}
/// <summary>Set how much RAM we can use before flushing. </summary>
internal void SetRAMBufferSizeMB(double mb)
{
if (mb == IndexWriter.DISABLE_AUTO_FLUSH)
{
ramBufferSize = IndexWriter.DISABLE_AUTO_FLUSH;
}
else
{
ramBufferSize = (long) (mb * 1024 * 1024);
}
}
internal double GetRAMBufferSizeMB()
{
if (ramBufferSize == IndexWriter.DISABLE_AUTO_FLUSH)
{
return ramBufferSize;
}
else
{
return ramBufferSize / 1024.0 / 1024.0;
}
}
/// <summary>Set max buffered docs, which means we will flush by
/// doc count instead of by RAM usage.
/// </summary>
internal void SetMaxBufferedDocs(int count)
{
maxBufferedDocs = count;
}
internal int GetMaxBufferedDocs()
{
return maxBufferedDocs;
}
/// <summary>Get current segment name we are writing. </summary>
internal System.String GetSegment()
{
return segment;
}
/// <summary>Returns how many docs are currently buffered in RAM. </summary>
internal int GetNumDocsInRAM()
{
return numDocsInRAM;
}
/// <summary>Returns the current doc store segment we are writing
/// to. This will be the same as segment when autoCommit
/// is true.
/// </summary>
internal System.String GetDocStoreSegment()
{
return docStoreSegment;
}
/// <summary>Returns the doc offset into the shared doc store for
/// the current buffered docs.
/// </summary>
internal int GetDocStoreOffset()
{
return docStoreOffset;
}
/// <summary>Closes the current open doc stores and returns the doc
/// store segment name. This returns null if there are
/// no buffered documents.
/// </summary>
internal System.String CloseDocStore()
{
System.Diagnostics.Debug.Assert(AllThreadsIdle());
System.Collections.IList flushedFiles = Files();
if (infoStream != null)
infoStream.WriteLine("\ncloseDocStore: " + flushedFiles.Count + " files to flush to segment " + docStoreSegment + " numDocs=" + numDocsInStore);
if (flushedFiles.Count > 0)
{
files = null;
if (tvx != null)
{
// At least one doc in this run had term vectors enabled
System.Diagnostics.Debug.Assert(docStoreSegment != null);
tvx.Close();
tvf.Close();
tvd.Close();
tvx = null;
}
if (fieldsWriter != null)
{
System.Diagnostics.Debug.Assert(docStoreSegment != null);
fieldsWriter.Close();
fieldsWriter = null;
}
System.String s = docStoreSegment;
docStoreSegment = null;
docStoreOffset = 0;
numDocsInStore = 0;
return s;
}
else
{
return null;
}
}
private System.Collections.IList files = null; // Cached list of files we've created
private System.Collections.IList abortedFiles = null; // List of files that were written before last abort()
internal System.Collections.IList AbortedFiles()
{
return abortedFiles;
}
/* Returns list of files in use by this instance,
* including any flushed segments. */
internal System.Collections.IList Files()
{
lock (this)
{
if (files != null)
return files;
files = new System.Collections.ArrayList();
// Stored fields:
if (fieldsWriter != null)
{
System.Diagnostics.Debug.Assert(docStoreSegment != null);
files.Add(docStoreSegment + "." + IndexFileNames.FIELDS_EXTENSION);
files.Add(docStoreSegment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION);
}
// Vectors:
if (tvx != null)
{
System.Diagnostics.Debug.Assert(docStoreSegment != null);
files.Add(docStoreSegment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
files.Add(docStoreSegment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION);
files.Add(docStoreSegment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
}
return files;
}
}
internal void SetAborting()
{
lock (this)
{
abortCount++;
}
}
/// <summary>Called if we hit an exception when adding docs,
/// flushing, etc. This resets our state, discarding any
/// docs added since last flush. If ae is non-null, it
/// contains the root cause exception (which we re-throw
/// after we are done aborting).
/// </summary>
internal void Abort(AbortException ae)
{
lock (this)
{
// Anywhere that throws an AbortException must first
// mark aborting to make sure while the exception is
// unwinding the un-synchronized stack, no thread grabs
// the corrupt ThreadState that hit the aborting
// exception:
System.Diagnostics.Debug.Assert(ae == null || abortCount > 0);
try
{
if (infoStream != null)
infoStream.WriteLine("docWriter: now abort");
// Forcefully remove waiting ThreadStates from line
for (int i = 0; i < numWaiting; i++)
waitingThreadStates[i].isIdle = true;
numWaiting = 0;
// Wait for all other threads to finish with DocumentsWriter:
PauseAllThreads();
System.Diagnostics.Debug.Assert(0 == numWaiting);
try
{
bufferedDeleteTerms.Clear();
bufferedDeleteDocIDs.Clear();
numBufferedDeleteTerms = 0;
try
{
abortedFiles = Files();
}
catch (System.Exception)
{
abortedFiles = null;
}
docStoreSegment = null;
numDocsInStore = 0;
docStoreOffset = 0;
files = null;
// Clear vectors & fields from ThreadStates
for (int i = 0; i < threadStates.Length; i++)
{
ThreadState state = threadStates[i];
state.tvfLocal.Reset();
state.fdtLocal.Reset();
if (state.localFieldsWriter != null)
{
try
{
state.localFieldsWriter.Close();
}
catch (System.Exception)
{
}
state.localFieldsWriter = null;
}
}
// Reset vectors writer
if (tvx != null)
{
try
{
tvx.Close();
}
catch (System.Exception)
{
}
tvx = null;
}
if (tvd != null)
{
try
{
tvd.Close();
}
catch (System.Exception)
{
}
tvd = null;
}
if (tvf != null)
{
try
{
tvf.Close();
}
catch (System.Exception)
{
}
tvf = null;
}
// Reset fields writer
if (fieldsWriter != null)
{
try
{
fieldsWriter.Close();
}
catch (System.Exception)
{
}
fieldsWriter = null;
}
// Discard pending norms:
int numField = fieldInfos.Size();
for (int i = 0; i < numField; i++)
{
FieldInfo fi = fieldInfos.FieldInfo(i);
if (fi.isIndexed && !fi.omitNorms)
{
BufferedNorms n = norms[i];
if (n != null)
try
{
n.Reset();
}
catch (System.Exception)
{
}
}
}
// Reset all postings data
ResetPostingsData();
}
finally
{
ResumeAllThreads();
}
// If we have a root cause exception, re-throw it now:
if (ae != null)
{
System.Exception t = ae.InnerException;
if (t is System.IO.IOException)
throw (System.IO.IOException) t;
else if (t is System.SystemException)
throw (System.SystemException) t;
else if (t is System.ApplicationException)
throw (System.ApplicationException) t;
else
// Should not get here
System.Diagnostics.Debug.Assert(false, "unknown exception: " + t);
}
}
finally
{
if (ae != null)
abortCount--;
System.Threading.Monitor.PulseAll(this);
}
}
}
/// <summary>Reset after a flush </summary>
private void ResetPostingsData()
{
// All ThreadStates should be idle when we are called
System.Diagnostics.Debug.Assert(AllThreadsIdle());
threadBindings.Clear();
segment = null;
numDocsInRAM = 0;
nextDocID = 0;
nextWriteDocID = 0;
files = null;
BalanceRAM();
bufferIsFull = false;
flushPending = false;
for (int i = 0; i < threadStates.Length; i++)
{
threadStates[i].numThreads = 0;
threadStates[i].ResetPostings();
}
numBytesUsed = 0;
}
// Returns true if an abort is in progress
internal bool PauseAllThreads()
{
lock (this)
{
pauseThreads++;
while (!AllThreadsIdle())
{
try
{
System.Threading.Monitor.Wait(this);
}
catch (System.Threading.ThreadInterruptedException)
{
SupportClass.ThreadClass.Current().Interrupt();
}
}
return abortCount > 0;
}
}
internal void ResumeAllThreads()
{
lock (this)
{
pauseThreads--;
System.Diagnostics.Debug.Assert(pauseThreads >= 0);
if (0 == pauseThreads)
System.Threading.Monitor.PulseAll(this);
}
}
private bool AllThreadsIdle()
{
lock (this)
{
for (int i = 0; i < threadStates.Length; i++)
if (!threadStates[i].isIdle)
return false;
return true;
}
}
private bool hasNorms; // Whether any norms were seen since last flush
internal System.Collections.IList newFiles;
/// <summary>Flush all pending docs to a new segment </summary>
internal int Flush(bool closeDocStore)
{
lock (this)
{
System.Diagnostics.Debug.Assert(AllThreadsIdle());
if (segment == null)
// In case we are asked to flush an empty segment
segment = writer.NewSegmentName();
newFiles = new System.Collections.ArrayList();
docStoreOffset = numDocsInStore;
int docCount;
System.Diagnostics.Debug.Assert(numDocsInRAM > 0);
if (infoStream != null)
infoStream.WriteLine("\nflush postings as segment " + segment + " numDocs=" + numDocsInRAM);
bool success = false;
try
{
System.Collections.IEnumerator e;
if (closeDocStore)
{
System.Diagnostics.Debug.Assert(docStoreSegment != null);
System.Diagnostics.Debug.Assert(docStoreSegment.Equals(segment));
e = Files().GetEnumerator();
while (e.MoveNext())
newFiles.Add(e.Current);
CloseDocStore();
}
fieldInfos.Write(directory, segment + ".fnm");
docCount = numDocsInRAM;
e = WriteSegment().GetEnumerator();
while (e.MoveNext())
newFiles.Add(e.Current);
success = true;
}
finally
{
if (!success)
Abort(null);
}
return docCount;
}
}
/// <summary>Build compound file for the segment we just flushed </summary>
internal void CreateCompoundFile(System.String segment)
{
CompoundFileWriter cfsWriter = new CompoundFileWriter(directory, segment + "." + IndexFileNames.COMPOUND_FILE_EXTENSION);
int size = newFiles.Count;
for (int i = 0; i < size; i++)
cfsWriter.AddFile((System.String) newFiles[i]);
// Perform the merge
cfsWriter.Close();
}
/// <summary>Sets flushPending if it is not already set and returns
/// whether it was set. This is used by IndexWriter to
/// trigger a single flush even when multiple threads are
/// trying to do so.
/// </summary>
internal bool SetFlushPending()
{
lock (this)
{
if (flushPending)
return false;
else
{
flushPending = true;
return true;
}
}
}
internal void ClearFlushPending()
{
lock (this)
{
flushPending = false;
}
}
/// <summary>Per-thread state. We keep a separate Posting hash and
/// other state for each thread and then merge postings
/// hashes from all threads when writing the segment.
/// </summary>
sealed internal class ThreadState
{
private void InitBlock(DocumentsWriter enclosingInstance)
{
this.enclosingInstance = enclosingInstance;
allFieldDataArray = new FieldData[10];
postingsPool = new ByteBlockPool(enclosingInstance);
vectorsPool = new ByteBlockPool(enclosingInstance);
charPool = new CharBlockPool(enclosingInstance);
}
private DocumentsWriter enclosingInstance;
public DocumentsWriter Enclosing_Instance
{
get
{
return enclosingInstance;
}
}
internal Posting[] postingsFreeList; // Free Posting instances
internal int postingsFreeCount;
internal RAMOutputStream tvfLocal = new RAMOutputStream(); // Term vectors for one doc
internal RAMOutputStream fdtLocal = new RAMOutputStream(); // Stored fields for one doc
internal FieldsWriter localFieldsWriter; // Fields for one doc
internal long[] vectorFieldPointers;
internal int[] vectorFieldNumbers;
internal bool isIdle = true; // Whether we are in use
internal int numThreads = 1; // Number of threads that use this instance
internal int docID; // docID we are now working on
internal int numStoredFields; // How many stored fields in current doc
internal float docBoost; // Boost for current doc
internal FieldData[] fieldDataArray; // Fields touched by current doc
internal int numFieldData; // How many fields in current doc
internal int numVectorFields; // How many vector fields in current doc
internal FieldData[] allFieldDataArray; // All FieldData instances
internal int numAllFieldData;
internal FieldData[] fieldDataHash; // Hash FieldData instances by field name
internal int fieldDataHashMask;
internal System.String maxTermPrefix; // Non-null prefix of a too-large term if this
// doc has one
internal bool doFlushAfter;
public ThreadState(DocumentsWriter enclosingInstance)
{
InitBlock(enclosingInstance);
fieldDataArray = new FieldData[8];
fieldDataHash = new FieldData[16];
fieldDataHashMask = 15;
vectorFieldPointers = new long[10];
vectorFieldNumbers = new int[10];
postingsFreeList = new Posting[256];
postingsFreeCount = 0;
}
/// <summary>Clear the postings hash and return objects back to
/// shared pool
/// </summary>
public void ResetPostings()
{
fieldGen = 0;
maxPostingsVectors = 0;
doFlushAfter = false;
if (localFieldsWriter != null)
{
localFieldsWriter.Close();
localFieldsWriter = null;
}
postingsPool.Reset();
charPool.Reset();
Enclosing_Instance.RecyclePostings(postingsFreeList, postingsFreeCount);
postingsFreeCount = 0;
for (int i = 0; i < numAllFieldData; i++)
{
FieldData fp = allFieldDataArray[i];
fp.lastGen = - 1;
if (fp.numPostings > 0)
fp.ResetPostingArrays();
}
}
/// <summary>Move all per-document state that was accumulated in
/// the ThreadState into the "real" stores.
/// </summary>
public void WriteDocument()
{
// If we hit an exception while appending to the
// stored fields or term vectors files, we have to
// abort all documents since we last flushed because
// it means those files are possibly inconsistent.
try
{
Enclosing_Instance.numDocsInStore++;
// Append stored fields to the real FieldsWriter:
Enclosing_Instance.fieldsWriter.FlushDocument(numStoredFields, fdtLocal);
fdtLocal.Reset();
// Append term vectors to the real outputs:
if (Enclosing_Instance.tvx != null)
{
Enclosing_Instance.tvx.WriteLong(Enclosing_Instance.tvd.GetFilePointer());
Enclosing_Instance.tvd.WriteVInt(numVectorFields);
if (numVectorFields > 0)
{
for (int i = 0; i < numVectorFields; i++)
Enclosing_Instance.tvd.WriteVInt(vectorFieldNumbers[i]);
System.Diagnostics.Debug.Assert(0 == vectorFieldPointers [0]);
Enclosing_Instance.tvd.WriteVLong(Enclosing_Instance.tvf.GetFilePointer());
long lastPos = vectorFieldPointers[0];
for (int i = 1; i < numVectorFields; i++)
{
long pos = vectorFieldPointers[i];
Enclosing_Instance.tvd.WriteVLong(pos - lastPos);
lastPos = pos;
}
tvfLocal.WriteTo(Enclosing_Instance.tvf);
tvfLocal.Reset();
}
}
// Append norms for the fields we saw:
for (int i = 0; i < numFieldData; i++)
{
FieldData fp = fieldDataArray[i];
if (fp.doNorms)
{
BufferedNorms bn = Enclosing_Instance.norms[fp.fieldInfo.number];
System.Diagnostics.Debug.Assert(bn != null);
System.Diagnostics.Debug.Assert(bn.upto <= docID);
bn.Fill(docID);
float norm = fp.boost * Enclosing_Instance.writer.GetSimilarity().LengthNorm(fp.fieldInfo.name, fp.length);
bn.Add(norm);
}
}
}
catch (System.Exception t)
{
// Forcefully idle this threadstate -- its state will
// be reset by abort()
isIdle = true;
throw new AbortException(t, Enclosing_Instance);
}
if (Enclosing_Instance.bufferIsFull && !Enclosing_Instance.flushPending)
{
Enclosing_Instance.flushPending = true;
doFlushAfter = true;
}
}
internal int fieldGen;
/// <summary>Initializes shared state for this new document </summary>
internal void Init(Document doc, int docID)
{
System.Diagnostics.Debug.Assert(!isIdle);
this.docID = docID;
docBoost = doc.GetBoost();
numStoredFields = 0;
numFieldData = 0;
numVectorFields = 0;
maxTermPrefix = null;
System.Diagnostics.Debug.Assert(0 == fdtLocal.Length());
System.Diagnostics.Debug.Assert(0 == fdtLocal.GetFilePointer());
System.Diagnostics.Debug.Assert(0 == tvfLocal.Length());
System.Diagnostics.Debug.Assert(0 == tvfLocal.GetFilePointer());
int thisFieldGen = fieldGen++;
System.Collections.IList docFields = doc.GetFields();
int numDocFields = docFields.Count;
bool docHasVectors = false;
// Absorb any new fields first seen in this document.
// Also absorb any changes to fields we had already
// seen before (eg suddenly turning on norms or
// vectors, etc.):
for (int i = 0; i < numDocFields; i++)
{
Fieldable field = (Fieldable) docFields[i];
FieldInfo fi = Enclosing_Instance.fieldInfos.Add(field.Name(), field.IsIndexed(), field.IsTermVectorStored(), field.IsStorePositionWithTermVector(), field.IsStoreOffsetWithTermVector(), field.GetOmitNorms(), false);
if (fi.isIndexed && !fi.omitNorms)
{
// Maybe grow our buffered norms
if (Enclosing_Instance.norms.Length <= fi.number)
{
int newSize = (int) ((1 + fi.number) * 1.25);
BufferedNorms[] newNorms = new BufferedNorms[newSize];
Array.Copy(Enclosing_Instance.norms, 0, newNorms, 0, Enclosing_Instance.norms.Length);
Enclosing_Instance.norms = newNorms;
}
if (Enclosing_Instance.norms[fi.number] == null)
Enclosing_Instance.norms[fi.number] = new BufferedNorms();
Enclosing_Instance.hasNorms = true;
}
// Make sure we have a FieldData allocated
int hashPos = fi.name.GetHashCode() & fieldDataHashMask;
FieldData fp = fieldDataHash[hashPos];
while (fp != null && !fp.fieldInfo.name.Equals(fi.name))
fp = fp.next;
if (fp == null)
{
fp = new FieldData(this, fi);
fp.next = fieldDataHash[hashPos];
fieldDataHash[hashPos] = fp;
if (numAllFieldData == allFieldDataArray.Length)
{
int newSize = (int) (allFieldDataArray.Length * 1.5);
int newHashSize = fieldDataHash.Length * 2;
FieldData[] newArray = new FieldData[newSize];
FieldData[] newHashArray = new FieldData[newHashSize];
Array.Copy(allFieldDataArray, 0, newArray, 0, numAllFieldData);
// Rehash
fieldDataHashMask = newSize - 1;
for (int j = 0; j < fieldDataHash.Length; j++)
{
FieldData fp0 = fieldDataHash[j];
while (fp0 != null)
{
hashPos = fp0.fieldInfo.name.GetHashCode() & fieldDataHashMask;
FieldData nextFP0 = fp0.next;
fp0.next = newHashArray[hashPos];
newHashArray[hashPos] = fp0;
fp0 = nextFP0;
}
}
allFieldDataArray = newArray;
fieldDataHash = newHashArray;
}
allFieldDataArray[numAllFieldData++] = fp;
}
else
{
System.Diagnostics.Debug.Assert(fp.fieldInfo == fi);
}
if (thisFieldGen != fp.lastGen)
{
// First time we're seeing this field for this doc
fp.lastGen = thisFieldGen;
fp.fieldCount = 0;
fp.doVectors = fp.doVectorPositions = fp.doVectorOffsets = false;
fp.doNorms = fi.isIndexed && !fi.omitNorms;
if (numFieldData == fieldDataArray.Length)
{
int newSize = fieldDataArray.Length * 2;
FieldData[] newArray = new FieldData[newSize];
Array.Copy(fieldDataArray, 0, newArray, 0, numFieldData);
fieldDataArray = newArray;
}
fieldDataArray[numFieldData++] = fp;
}
if (field.IsTermVectorStored())
{
if (!fp.doVectors && numVectorFields++ == vectorFieldPointers.Length)
{
int newSize = (int) (numVectorFields * 1.5);
vectorFieldPointers = new long[newSize];
vectorFieldNumbers = new int[newSize];
}
fp.doVectors = true;
docHasVectors = true;
fp.doVectorPositions |= field.IsStorePositionWithTermVector();
fp.doVectorOffsets |= field.IsStoreOffsetWithTermVector();
}
if (fp.fieldCount == fp.docFields.Length)
{
Fieldable[] newArray = new Fieldable[fp.docFields.Length * 2];
Array.Copy(fp.docFields, 0, newArray, 0, fp.docFields.Length);
fp.docFields = newArray;
}
// Lazily allocate arrays for postings:
if (field.IsIndexed() && fp.postingsHash == null)
fp.InitPostingArrays();
fp.docFields[fp.fieldCount++] = field;
}
// Maybe init the local & global fieldsWriter
if (localFieldsWriter == null)
{
if (Enclosing_Instance.fieldsWriter == null)
{
System.Diagnostics.Debug.Assert(Enclosing_Instance.docStoreSegment == null);
System.Diagnostics.Debug.Assert(Enclosing_Instance.segment != null);
Enclosing_Instance.docStoreSegment = Enclosing_Instance.segment;
// If we hit an exception while init'ing the
// fieldsWriter, we must abort this segment
// because those files will be in an unknown
// state:
try
{
Enclosing_Instance.fieldsWriter = new FieldsWriter(Enclosing_Instance.directory, Enclosing_Instance.docStoreSegment, Enclosing_Instance.fieldInfos);
}
catch (System.Exception t)
{
throw new AbortException(t, Enclosing_Instance);
}
Enclosing_Instance.files = null;
}
localFieldsWriter = new FieldsWriter(null, fdtLocal, Enclosing_Instance.fieldInfos);
}
// First time we see a doc that has field(s) with
// stored vectors, we init our tvx writer
if (docHasVectors)
{
if (Enclosing_Instance.tvx == null)
{
System.Diagnostics.Debug.Assert(Enclosing_Instance.docStoreSegment != null);
// If we hit an exception while init'ing the term
// vector output files, we must abort this segment
// because those files will be in an unknown
// state:
try
{
Enclosing_Instance.tvx = Enclosing_Instance.directory.CreateOutput(Enclosing_Instance.docStoreSegment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
Enclosing_Instance.tvx.WriteInt(TermVectorsReader.FORMAT_VERSION);
Enclosing_Instance.tvd = Enclosing_Instance.directory.CreateOutput(Enclosing_Instance.docStoreSegment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
Enclosing_Instance.tvd.WriteInt(TermVectorsReader.FORMAT_VERSION);
Enclosing_Instance.tvf = Enclosing_Instance.directory.CreateOutput(Enclosing_Instance.docStoreSegment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION);
Enclosing_Instance.tvf.WriteInt(TermVectorsReader.FORMAT_VERSION);
// We must "catch up" for all docs before us
// that had no vectors:
for (int i = 0; i < Enclosing_Instance.numDocsInStore; i++)
{
Enclosing_Instance.tvx.WriteLong(Enclosing_Instance.tvd.GetFilePointer());
Enclosing_Instance.tvd.WriteVInt(0);
}
}
catch (System.Exception t)
{
throw new AbortException(t, Enclosing_Instance);
}
Enclosing_Instance.files = null;
}
numVectorFields = 0;
}
}
/// <summary>Do in-place sort of Posting array </summary>
internal void DoPostingSort(Posting[] postings, int numPosting)
{
QuickSort(postings, 0, numPosting - 1);
}
internal void QuickSort(Posting[] postings, int lo, int hi)
{
if (lo >= hi)
return ;
int mid = SupportClass.Number.URShift((lo + hi), 1);
if (ComparePostings(postings[lo], postings[mid]) > 0)
{
Posting tmp = postings[lo];
postings[lo] = postings[mid];
postings[mid] = tmp;
}
if (ComparePostings(postings[mid], postings[hi]) > 0)
{
Posting tmp = postings[mid];
postings[mid] = postings[hi];
postings[hi] = tmp;
if (ComparePostings(postings[lo], postings[mid]) > 0)
{
Posting tmp2 = postings[lo];
postings[lo] = postings[mid];
postings[mid] = tmp2;
}
}
int left = lo + 1;
int right = hi - 1;
if (left >= right)
return ;
Posting partition = postings[mid];
for (; ; )
{
while (ComparePostings(postings[right], partition) > 0)
--right;
while (left < right && ComparePostings(postings[left], partition) <= 0)
++left;
if (left < right)
{
Posting tmp = postings[left];
postings[left] = postings[right];
postings[right] = tmp;
--right;
}
else
{
break;
}
}
QuickSort(postings, lo, left);
QuickSort(postings, left + 1, hi);
}
/// <summary>Do in-place sort of PostingVector array </summary>
internal void DoVectorSort(PostingVector[] postings, int numPosting)
{
QuickSort(postings, 0, numPosting - 1);
}
internal void QuickSort(PostingVector[] postings, int lo, int hi)
{
if (lo >= hi)
return ;
int mid = SupportClass.Number.URShift((lo + hi), 1);
if (ComparePostings(postings[lo].p, postings[mid].p) > 0)
{
PostingVector tmp = postings[lo];
postings[lo] = postings[mid];
postings[mid] = tmp;
}
if (ComparePostings(postings[mid].p, postings[hi].p) > 0)
{
PostingVector tmp = postings[mid];
postings[mid] = postings[hi];
postings[hi] = tmp;
if (ComparePostings(postings[lo].p, postings[mid].p) > 0)
{
PostingVector tmp2 = postings[lo];
postings[lo] = postings[mid];
postings[mid] = tmp2;
}
}
int left = lo + 1;
int right = hi - 1;
if (left >= right)
return ;
PostingVector partition = postings[mid];
for (; ; )
{
while (ComparePostings(postings[right].p, partition.p) > 0)
--right;
while (left < right && ComparePostings(postings[left].p, partition.p) <= 0)
++left;
if (left < right)
{
PostingVector tmp = postings[left];
postings[left] = postings[right];
postings[right] = tmp;
--right;
}
else
{
break;
}
}
QuickSort(postings, lo, left);
QuickSort(postings, left + 1, hi);
}
/// <summary>If there are fields we've seen but did not see again
/// in the last run, then free them up. Also reduce
/// postings hash size.
/// </summary>
internal void TrimFields()
{
int upto = 0;
for (int i = 0; i < numAllFieldData; i++)
{
FieldData fp = allFieldDataArray[i];
if (fp.lastGen == - 1)
{
// This field was not seen since the previous
// flush, so, free up its resources now
// Unhash
int hashPos = fp.fieldInfo.name.GetHashCode() & fieldDataHashMask;
FieldData last = null;
FieldData fp0 = fieldDataHash[hashPos];
while (fp0 != fp)
{
last = fp0;
fp0 = fp0.next;
}
System.Diagnostics.Debug.Assert(fp0 != null);
if (last == null)
fieldDataHash[hashPos] = fp.next;
else
last.next = fp.next;
if (Enclosing_Instance.infoStream != null)
Enclosing_Instance.infoStream.WriteLine(" remove field=" + fp.fieldInfo.name);
}
else
{
// Reset
fp.lastGen = - 1;
allFieldDataArray[upto++] = fp;
if (fp.numPostings > 0 && ((float) fp.numPostings) / fp.postingsHashSize < 0.2)
{
int hashSize = fp.postingsHashSize;
// Reduce hash so it's between 25-50% full
while (fp.numPostings < (hashSize >> 1) && hashSize >= 2)
hashSize >>= 1;
hashSize <<= 1;
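// (Example: numPostings=3, hashSize=32 shrinks to 4 in the loop above
// and doubles back to 8 here, leaving the table ~37% full.)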
if (hashSize != fp.postingsHash.Length)
fp.RehashPostings(hashSize);
}
}
}
// If we didn't see any norms for this field since
// last flush, free it
for (int i = 0; i < Enclosing_Instance.norms.Length; i++)
{
BufferedNorms n = Enclosing_Instance.norms[i];
if (n != null && n.upto == 0)
Enclosing_Instance.norms[i] = null;
}
numAllFieldData = upto;
// Also pare back PostingsVectors if it's excessively
// large
if (maxPostingsVectors * 1.5 < postingsVectors.Length)
{
int newSize;
if (0 == maxPostingsVectors)
newSize = 1;
else
{
newSize = (int) (1.5 * maxPostingsVectors);
}
PostingVector[] newArray = new PostingVector[newSize];
Array.Copy(postingsVectors, 0, newArray, 0, newSize);
postingsVectors = newArray;
}
}
/// <summary>Tokenizes the fields of a document into Postings </summary>
internal void ProcessDocument(Analyzer analyzer)
{
int numFields = numFieldData;
System.Diagnostics.Debug.Assert(0 == fdtLocal.Length());
if (Enclosing_Instance.tvx != null)
// If we are writing vectors then we must visit
// fields in sorted order so they are written in
// sorted order. TODO: we actually only need to
// sort the subset of fields that have vectors
// enabled; we could save [small amount of] CPU
// here.
System.Array.Sort(fieldDataArray, 0, numFields);
// We process the document one field at a time
for (int i = 0; i < numFields; i++)
fieldDataArray[i].ProcessField(analyzer);
if (maxTermPrefix != null && Enclosing_Instance.infoStream != null)
Enclosing_Instance.infoStream.WriteLine("WARNING: document contains at least one immense term (longer than the max length " + Lucene.Net.Index.DocumentsWriter.MAX_TERM_LENGTH + "), all of which were skipped. Please correct the analyzer to not produce such terms. The prefix of the first immense term is: '" + maxTermPrefix + "...'");
if (Enclosing_Instance.ramBufferSize != IndexWriter.DISABLE_AUTO_FLUSH && Enclosing_Instance.numBytesUsed > 0.95 * Enclosing_Instance.ramBufferSize)
Enclosing_Instance.BalanceRAM();
}
internal ByteBlockPool postingsPool;
internal ByteBlockPool vectorsPool;
internal CharBlockPool charPool;
// Current posting we are working on
internal Posting p;
internal PostingVector vector;
// USE ONLY FOR DEBUGGING!
/*
public String getPostingText() {
char[] text = charPool.buffers[p.textStart >> CHAR_BLOCK_SHIFT];
int upto = p.textStart & CHAR_BLOCK_MASK;
while(text[upto] != 0xffff)
upto++;
return new String(text, p.textStart, upto-(p.textStart & BYTE_BLOCK_MASK));
}
*/
/// <summary>Test whether the text for current Posting p equals
/// current tokenText.
/// </summary>
internal bool PostingEquals(char[] tokenText, int tokenTextLen)
{
char[] text = charPool.buffers[p.textStart >> Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_SHIFT];
System.Diagnostics.Debug.Assert(text != null);
int pos = p.textStart & Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_MASK;
int tokenPos = 0;
for (; tokenPos < tokenTextLen; pos++, tokenPos++)
if (tokenText[tokenPos] != text[pos])
return false;
return 0xffff == text[pos];
}
/// <summary>Compares term text for two Posting instance and
/// returns -1 if p1 < p2; 1 if p1 > p2; else 0.
/// </summary>
internal int ComparePostings(Posting p1, Posting p2)
{
char[] text1 = charPool.buffers[p1.textStart >> Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_SHIFT];
int pos1 = p1.textStart & Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_MASK;
char[] text2 = charPool.buffers[p2.textStart >> Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_SHIFT];
int pos2 = p2.textStart & Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_MASK;
while (true)
{
char c1 = text1[pos1++];
char c2 = text2[pos2++];
if (c1 < c2)
if (0xffff == c2)
return 1;
else
return - 1;
else if (c2 < c1)
if (0xffff == c1)
return - 1;
else
return 1;
else if (0xffff == c1)
return 0;
}
}
/// <summary>Write vInt into freq stream of current Posting </summary>
public void WriteFreqVInt(int i)
{
while ((i & ~ 0x7F) != 0)
{
WriteFreqByte((byte) ((i & 0x7f) | 0x80));
i = SupportClass.Number.URShift(i, 7);
}
WriteFreqByte((byte) i);
}
/// <summary>Write vInt into prox stream of current Posting </summary>
public void WriteProxVInt(int i)
{
while ((i & ~ 0x7F) != 0)
{
WriteProxByte((byte) ((i & 0x7f) | 0x80));
i = SupportClass.Number.URShift(i, 7);
}
WriteProxByte((byte) i);
}
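// Worked example of the vInt encoding used by the two methods above
// (Lucene's standard variable-length int: low 7 bits per byte, high
// bit set means "more bytes follow"):
//
//   i = 300 (binary 1_0010_1100)
//     first byte:  (300 & 0x7f) | 0x80 = 0xAC   (low 7 bits + continuation bit)
//     second byte:  300 >>> 7          = 0x02   (remaining bits, high bit clear)
//
// so 300 is written as the two bytes 0xAC 0x02.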
/// <summary>Write byte into freq stream of current Posting </summary>
internal byte[] freq;
internal int freqUpto;
public void WriteFreqByte(byte b)
{
System.Diagnostics.Debug.Assert(freq != null);
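// A non-zero byte at the write position marks the end of the current
// slice (per the slice scheme described in the class comment, the pool's
// buffers are zero-filled and each slice ends with a small non-zero
// level byte), so we must ask the pool for the next, larger slice. The
// slice mechanics are implemented by ByteBlockPool.AllocSlice, not shown here.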
if (freq[freqUpto] != 0)
{
freqUpto = postingsPool.AllocSlice(freq, freqUpto);
freq = postingsPool.buffer;
p.freqUpto = postingsPool.byteOffset;
}
freq[freqUpto++] = b;
}
/// <summary>Write byte into prox stream of current Posting </summary>
internal byte[] prox;
internal int proxUpto;
public void WriteProxByte(byte b)
{
System.Diagnostics.Debug.Assert(prox != null);
if (prox[proxUpto] != 0)
{
proxUpto = postingsPool.AllocSlice(prox, proxUpto);
prox = postingsPool.buffer;
p.proxUpto = postingsPool.byteOffset;
System.Diagnostics.Debug.Assert(prox != null);
}
prox[proxUpto++] = b;
System.Diagnostics.Debug.Assert(proxUpto != prox.Length);
}
/// <summary>Currently only used to copy a payload into the prox
/// stream.
/// </summary>
public void WriteProxBytes(byte[] b, int offset, int len)
{
int offsetEnd = offset + len;
while (offset < offsetEnd)
{
if (prox[proxUpto] != 0)
{
// End marker
proxUpto = postingsPool.AllocSlice(prox, proxUpto);
prox = postingsPool.buffer;
p.proxUpto = postingsPool.byteOffset;
}
prox[proxUpto++] = b[offset++];
System.Diagnostics.Debug.Assert(proxUpto != prox.Length);
}
}
/// <summary>Write vInt into offsets stream of current
/// PostingVector
/// </summary>
public void WriteOffsetVInt(int i)
{
while ((i & ~ 0x7F) != 0)
{
WriteOffsetByte((byte) ((i & 0x7f) | 0x80));
i = SupportClass.Number.URShift(i, 7);
}
WriteOffsetByte((byte) i);
}
internal byte[] offsets;
internal int offsetUpto;
/// <summary>Write byte into offsets stream of current
/// PostingVector
/// </summary>
public void WriteOffsetByte(byte b)
{
System.Diagnostics.Debug.Assert(offsets != null);
if (offsets[offsetUpto] != 0)
{
offsetUpto = vectorsPool.AllocSlice(offsets, offsetUpto);
offsets = vectorsPool.buffer;
vector.offsetUpto = vectorsPool.byteOffset;
}
offsets[offsetUpto++] = b;
}
/// <summary>Write vInt into pos stream of current
/// PostingVector
/// </summary>
public void WritePosVInt(int i)
{
while ((i & ~ 0x7F) != 0)
{
WritePosByte((byte) ((i & 0x7f) | 0x80));
i = SupportClass.Number.URShift(i, 7);
}
WritePosByte((byte) i);
}
internal byte[] pos;
internal int posUpto;
/// <summary>Write byte into pos stream of current
/// PostingVector
/// </summary>
public void WritePosByte(byte b)
{
System.Diagnostics.Debug.Assert(pos != null);
if (pos[posUpto] != 0)
{
posUpto = vectorsPool.AllocSlice(pos, posUpto);
pos = vectorsPool.buffer;
vector.posUpto = vectorsPool.byteOffset;
}
pos[posUpto++] = b;
}
internal PostingVector[] postingsVectors = new PostingVector[1];
internal int maxPostingsVectors;
// Used to read a string value for a field
internal ReusableStringReader stringReader = new ReusableStringReader();
/// <summary>Holds data associated with a single field, including
/// the Postings hash. A document may have many
/// occurrences for a given field name; we gather all
/// such occurrences here (in docFields) so that we can
/// process the entire field at once.
/// </summary>
sealed internal class FieldData : System.IComparable
{
private void InitBlock(ThreadState enclosingInstance)
{
this.enclosingInstance = enclosingInstance;
}
private ThreadState enclosingInstance;
public ThreadState Enclosing_Instance
{
get
{
return enclosingInstance;
}
}
internal ThreadState threadState;
internal FieldInfo fieldInfo;
internal int fieldCount;
internal Fieldable[] docFields = new Fieldable[1];
internal int lastGen = - 1;
internal FieldData next;
internal bool doNorms;
internal bool doVectors;
internal bool doVectorPositions;
internal bool doVectorOffsets;
internal bool postingsCompacted;
internal int numPostings;
internal Posting[] postingsHash;
internal int postingsHashSize;
internal int postingsHashHalfSize;
internal int postingsHashMask;
internal int position;
internal int length;
internal int offset;
internal float boost;
internal int postingsVectorsUpto;
public FieldData(ThreadState enclosingInstance, FieldInfo fieldInfo)
{
InitBlock(enclosingInstance);
this.fieldInfo = fieldInfo;
threadState = Enclosing_Instance;
}
internal void ResetPostingArrays()
{
if (!postingsCompacted)
CompactPostings();
Enclosing_Instance.Enclosing_Instance.RecyclePostings(this.postingsHash, numPostings);
Array.Clear(postingsHash, 0, postingsHash.Length);
postingsCompacted = false;
numPostings = 0;
}
internal void InitPostingArrays()
{
// Target hash fill factor of <= 50%
// NOTE: must be a power of two for hash collision
// strategy to work correctly
postingsHashSize = 4;
postingsHashHalfSize = 2;
postingsHashMask = postingsHashSize - 1;
postingsHash = new Posting[postingsHashSize];
}
/// <summary>So Array.Sort can sort us. </summary>
public int CompareTo(System.Object o)
{
return String.CompareOrdinal(fieldInfo.name, ((FieldData) o).fieldInfo.name);
}
private void CompactPostings()
{
int upto = 0;
for (int i = 0; i < postingsHashSize; i++)
if (postingsHash[i] != null)
postingsHash[upto++] = postingsHash[i];
System.Diagnostics.Debug.Assert(upto == numPostings);
postingsCompacted = true;
}
/// <summary>Collapse the hash table & sort in-place. </summary>
public Posting[] SortPostings()
{
CompactPostings();
Enclosing_Instance.DoPostingSort(postingsHash, numPostings);
return postingsHash;
}
/// <summary>Process all occurrences of one field in the document. </summary>
public void ProcessField(Analyzer analyzer)
{
length = 0;
position = 0;
offset = 0;
boost = Enclosing_Instance.docBoost;
int maxFieldLength = Enclosing_Instance.Enclosing_Instance.writer.GetMaxFieldLength();
int limit = fieldCount;
Fieldable[] docFieldsFinal = docFields;
bool doWriteVectors = true;
// Walk through all occurrences in this doc for this
// field:
try
{
for (int j = 0; j < limit; j++)
{
Fieldable field = docFieldsFinal[j];
if (field.IsIndexed())
InvertField(field, analyzer, maxFieldLength);
if (field.IsStored())
{
Enclosing_Instance.numStoredFields++;
bool success = false;
try
{
Enclosing_Instance.localFieldsWriter.WriteField(fieldInfo, field);
success = true;
}
finally
{
// If we hit an exception inside
// localFieldsWriter.writeField, the
// contents of fdtLocal can be corrupt, so
// we must discard all stored fields for
// this document:
if (!success)
Enclosing_Instance.fdtLocal.Reset();
}
}
docFieldsFinal[j] = null;
}
}
catch (AbortException ae)
{
doWriteVectors = false;
throw ae;
}
finally
{
if (postingsVectorsUpto > 0)
{
try
{
if (doWriteVectors)
{
// Add term vectors for this field
bool success = false;
try
{
WriteVectors(fieldInfo);
success = true;
}
finally
{
if (!success)
{
// If we hit an exception inside
// writeVectors, the contents of tvfLocal
// can be corrupt, so we must discard all
// term vectors for this document:
Enclosing_Instance.numVectorFields = 0;
Enclosing_Instance.tvfLocal.Reset();
}
}
}
}
finally
{
if (postingsVectorsUpto > Enclosing_Instance.maxPostingsVectors)
Enclosing_Instance.maxPostingsVectors = postingsVectorsUpto;
postingsVectorsUpto = 0;
Enclosing_Instance.vectorsPool.Reset();
}
}
}
}
internal int offsetEnd;
internal Token localToken = new Token();
/* Invert one occurrence of one field in the document */
public void InvertField(Fieldable field, Analyzer analyzer, int maxFieldLength)
{
if (length > 0)
position += analyzer.GetPositionIncrementGap(fieldInfo.name);
if (!field.IsTokenized())
{
// un-tokenized field
System.String stringValue = field.StringValue();
int valueLength = stringValue.Length;
Token token = localToken;
token.Clear();
char[] termBuffer = token.TermBuffer();
if (termBuffer.Length < valueLength)
termBuffer = token.ResizeTermBuffer(valueLength);
DocumentsWriter.GetCharsFromString(stringValue, 0, valueLength, termBuffer, 0);
token.SetTermLength(valueLength);
token.SetStartOffset(offset);
token.SetEndOffset(offset + stringValue.Length);
AddPosition(token);
offset += stringValue.Length;
length++;
}
else
{
// tokenized field
TokenStream stream;
TokenStream streamValue = field.TokenStreamValue();
if (streamValue != null)
stream = streamValue;
else
{
// the field does not have a TokenStream,
// so we have to obtain one from the analyzer
System.IO.TextReader reader; // find or make Reader
System.IO.TextReader readerValue = field.ReaderValue();
if (readerValue != null)
reader = readerValue;
else
{
System.String stringValue = field.StringValue();
if (stringValue == null)
throw new System.ArgumentException("field must have either TokenStream, String or Reader value");
Enclosing_Instance.stringReader.Init(stringValue);
reader = Enclosing_Instance.stringReader;
}
// Tokenize field and add to postingTable
stream = analyzer.ReusableTokenStream(fieldInfo.name, reader);
}
// reset the TokenStream to the first token
stream.Reset();
try
{
offsetEnd = offset - 1;
Token token;
for (; ; )
{
token = stream.Next(localToken);
if (token == null)
break;
position += (token.GetPositionIncrement() - 1);
AddPosition(token);
if (++length >= maxFieldLength)
{
if (Enclosing_Instance.Enclosing_Instance.infoStream != null)
Enclosing_Instance.Enclosing_Instance.infoStream.WriteLine("maxFieldLength " + maxFieldLength + " reached for field " + fieldInfo.name + ", ignoring following tokens");
break;
}
}
offset = offsetEnd + 1;
}
finally
{
stream.Close();
}
}
boost *= field.GetBoost();
}
/// <summary>Only called when term vectors are enabled. This
/// is called the first time we see a given term for
/// each document, to allocate a PostingVector
/// instance that is used to record data needed to
/// write the posting vectors.
/// </summary>
private PostingVector AddNewVector()
{
if (postingsVectorsUpto == Enclosing_Instance.postingsVectors.Length)
{
int newSize;
if (Enclosing_Instance.postingsVectors.Length < 2)
newSize = 2;
else
{
newSize = (int) (1.5 * Enclosing_Instance.postingsVectors.Length);
}
PostingVector[] newArray = new PostingVector[newSize];
Array.Copy(Enclosing_Instance.postingsVectors, 0, newArray, 0, Enclosing_Instance.postingsVectors.Length);
Enclosing_Instance.postingsVectors = newArray;
}
Enclosing_Instance.p.vector = Enclosing_Instance.postingsVectors[postingsVectorsUpto];
if (Enclosing_Instance.p.vector == null)
Enclosing_Instance.p.vector = Enclosing_Instance.postingsVectors[postingsVectorsUpto] = new PostingVector();
postingsVectorsUpto++;
PostingVector v = Enclosing_Instance.p.vector;
v.p = Enclosing_Instance.p;
int firstSize = Lucene.Net.Index.DocumentsWriter.levelSizeArray[0];
if (doVectorPositions)
{
int upto = Enclosing_Instance.vectorsPool.NewSlice(firstSize);
v.posStart = v.posUpto = Enclosing_Instance.vectorsPool.byteOffset + upto;
}
if (doVectorOffsets)
{
int upto = Enclosing_Instance.vectorsPool.NewSlice(firstSize);
v.offsetStart = v.offsetUpto = Enclosing_Instance.vectorsPool.byteOffset + upto;
}
return v;
}
internal int offsetStartCode;
internal int offsetStart;
/// <summary>This is the hotspot of indexing: it's called once
/// for every term of every document. Its job is to
/// update the postings byte stream (Postings hash)
/// based on the occurrence of a single term.
/// </summary>
private void AddPosition(Token token)
{
Payload payload = token.GetPayload();
// Get the text of this term. Term can either
// provide a String token or offset into a char[]
// array
char[] tokenText = token.TermBuffer();
int tokenTextLen = token.TermLength();
int code = 0;
// Compute hashcode
int downto = tokenTextLen;
while (downto > 0)
code = (code * 31) + tokenText[--downto];
// System.out.println(" addPosition: buffer=" + new String(tokenText, 0, tokenTextLen) + " pos=" + position + " offsetStart=" + (offset+token.startOffset()) + " offsetEnd=" + (offset + token.endOffset()) + " docID=" + docID + " doPos=" + doVectorPositions + " doOffset=" + doVectorOffsets);
int hashPos = code & postingsHashMask;
System.Diagnostics.Debug.Assert(!postingsCompacted);
// Locate Posting in hash
Enclosing_Instance.p = postingsHash[hashPos];
if (Enclosing_Instance.p != null && !Enclosing_Instance.PostingEquals(tokenText, tokenTextLen))
{
// Conflict: keep searching different locations in
// the hash table.
int inc = ((code >> 8) + code) | 1;
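// (The increment is forced odd so it is coprime to the power-of-two
// table size; repeated probing therefore visits every slot before
// cycling.)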
do
{
code += inc;
hashPos = code & postingsHashMask;
Enclosing_Instance.p = postingsHash[hashPos];
}
while (Enclosing_Instance.p != null && !Enclosing_Instance.PostingEquals(tokenText, tokenTextLen));
}
int proxCode;
// If we hit an exception below, it's possible the
// posting list or term vectors data will be
// partially written and thus inconsistent if
// flushed, so we have to abort all documents
// since the last flush:
try
{
if (Enclosing_Instance.p != null)
{
// term seen since last flush
if (Enclosing_Instance.docID != Enclosing_Instance.p.lastDocID)
{
// term not yet seen in this doc
// System.out.println(" seen before (new docID=" + docID + ") freqUpto=" + p.freqUpto +" proxUpto=" + p.proxUpto);
System.Diagnostics.Debug.Assert(Enclosing_Instance.p.docFreq > 0);
// Now that we know doc freq for previous doc,
// write it & lastDocCode
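// (lastDocCode is the doc-delta shifted left by one, with the low
// bit reserved to mean "freq == 1" so the freq can be omitted. For
// example, a delta of 3 with freq 1 is written as the single vInt 7;
// a delta of 3 with freq 5 is written as the two vInts 6 and 5.)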
Enclosing_Instance.freqUpto = Enclosing_Instance.p.freqUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_MASK;
Enclosing_Instance.freq = Enclosing_Instance.postingsPool.buffers[Enclosing_Instance.p.freqUpto >> Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_SHIFT];
if (1 == Enclosing_Instance.p.docFreq)
Enclosing_Instance.WriteFreqVInt(Enclosing_Instance.p.lastDocCode | 1);
else
{
Enclosing_Instance.WriteFreqVInt(Enclosing_Instance.p.lastDocCode);
Enclosing_Instance.WriteFreqVInt(Enclosing_Instance.p.docFreq);
}
Enclosing_Instance.p.freqUpto = Enclosing_Instance.freqUpto + (Enclosing_Instance.p.freqUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_NOT_MASK);
if (doVectors)
{
Enclosing_Instance.vector = AddNewVector();
if (doVectorOffsets)
{
offsetStartCode = offsetStart = offset + token.StartOffset();
offsetEnd = offset + token.EndOffset();
}
}
proxCode = position;
Enclosing_Instance.p.docFreq = 1;
// Store code so we can write this after we're
// done with this new doc
Enclosing_Instance.p.lastDocCode = (Enclosing_Instance.docID - Enclosing_Instance.p.lastDocID) << 1;
Enclosing_Instance.p.lastDocID = Enclosing_Instance.docID;
}
else
{
// term already seen in this doc
// System.out.println(" seen before (same docID=" + docID + ") proxUpto=" + p.proxUpto);
Enclosing_Instance.p.docFreq++;
proxCode = position - Enclosing_Instance.p.lastPosition;
if (doVectors)
{
Enclosing_Instance.vector = Enclosing_Instance.p.vector;
if (Enclosing_Instance.vector == null)
Enclosing_Instance.vector = AddNewVector();
if (doVectorOffsets)
{
offsetStart = offset + token.StartOffset();
offsetEnd = offset + token.EndOffset();
offsetStartCode = offsetStart - Enclosing_Instance.vector.lastOffset;
}
}
}
}
else
{
// term not seen before
// System.out.println(" never seen docID=" + docID);
// Refill?
if (0 == Enclosing_Instance.postingsFreeCount)
{
Enclosing_Instance.Enclosing_Instance.GetPostings(Enclosing_Instance.postingsFreeList);
Enclosing_Instance.postingsFreeCount = Enclosing_Instance.postingsFreeList.Length;
}
int textLen1 = 1 + tokenTextLen;
if (textLen1 + Enclosing_Instance.charPool.byteUpto > Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_SIZE)
{
if (textLen1 > Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_SIZE)
{
// Just skip this term, to remain as robust as
// possible during indexing. A TokenFilter
// can be inserted into the analyzer chain if
// other behavior is wanted (pruning the term
// to a prefix, throwing an exception, etc).
if (Enclosing_Instance.maxTermPrefix == null)
Enclosing_Instance.maxTermPrefix = new System.String(tokenText, 0, 30);
// Still increment position:
position++;
return ;
}
Enclosing_Instance.charPool.NextBuffer();
}
char[] text = Enclosing_Instance.charPool.buffer;
int textUpto = Enclosing_Instance.charPool.byteUpto;
// Pull next free Posting from free list
Enclosing_Instance.p = Enclosing_Instance.postingsFreeList[--Enclosing_Instance.postingsFreeCount];
Enclosing_Instance.p.textStart = textUpto + Enclosing_Instance.charPool.byteOffset;
Enclosing_Instance.charPool.byteUpto += textLen1;
Array.Copy(tokenText, 0, text, textUpto, tokenTextLen);
text[textUpto + tokenTextLen] = (char) (0xffff);
System.Diagnostics.Debug.Assert(postingsHash [hashPos] == null);
postingsHash[hashPos] = Enclosing_Instance.p;
numPostings++;
if (numPostings == postingsHashHalfSize)
RehashPostings(2 * postingsHashSize);
// Init first slice for freq & prox streams
int firstSize = Lucene.Net.Index.DocumentsWriter.levelSizeArray[0];
int upto1 = Enclosing_Instance.postingsPool.NewSlice(firstSize);
Enclosing_Instance.p.freqStart = Enclosing_Instance.p.freqUpto = Enclosing_Instance.postingsPool.byteOffset + upto1;
int upto2 = Enclosing_Instance.postingsPool.NewSlice(firstSize);
Enclosing_Instance.p.proxStart = Enclosing_Instance.p.proxUpto = Enclosing_Instance.postingsPool.byteOffset + upto2;
Enclosing_Instance.p.lastDocCode = Enclosing_Instance.docID << 1;
Enclosing_Instance.p.lastDocID = Enclosing_Instance.docID;
Enclosing_Instance.p.docFreq = 1;
if (doVectors)
{
Enclosing_Instance.vector = AddNewVector();
if (doVectorOffsets)
{
offsetStart = offsetStartCode = offset + token.StartOffset();
offsetEnd = offset + token.EndOffset();
}
}
proxCode = position;
}
Enclosing_Instance.proxUpto = Enclosing_Instance.p.proxUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_MASK;
Enclosing_Instance.prox = Enclosing_Instance.postingsPool.buffers[Enclosing_Instance.p.proxUpto >> Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_SHIFT];
System.Diagnostics.Debug.Assert(Enclosing_Instance.prox != null);
if (payload != null && payload.length > 0)
{
Enclosing_Instance.WriteProxVInt((proxCode << 1) | 1);
Enclosing_Instance.WriteProxVInt(payload.length);
Enclosing_Instance.WriteProxBytes(payload.data, payload.offset, payload.length);
fieldInfo.storePayloads = true;
}
else
Enclosing_Instance.WriteProxVInt(proxCode << 1);
Enclosing_Instance.p.proxUpto = Enclosing_Instance.proxUpto + (Enclosing_Instance.p.proxUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_NOT_MASK);
Enclosing_Instance.p.lastPosition = position++;
if (doVectorPositions)
{
Enclosing_Instance.posUpto = Enclosing_Instance.vector.posUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_MASK;
Enclosing_Instance.pos = Enclosing_Instance.vectorsPool.buffers[Enclosing_Instance.vector.posUpto >> Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_SHIFT];
Enclosing_Instance.WritePosVInt(proxCode);
Enclosing_Instance.vector.posUpto = Enclosing_Instance.posUpto + (Enclosing_Instance.vector.posUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_NOT_MASK);
}
if (doVectorOffsets)
{
Enclosing_Instance.offsetUpto = Enclosing_Instance.vector.offsetUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_MASK;
Enclosing_Instance.offsets = Enclosing_Instance.vectorsPool.buffers[Enclosing_Instance.vector.offsetUpto >> Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_SHIFT];
Enclosing_Instance.WriteOffsetVInt(offsetStartCode);
Enclosing_Instance.WriteOffsetVInt(offsetEnd - offsetStart);
Enclosing_Instance.vector.lastOffset = offsetEnd;
Enclosing_Instance.vector.offsetUpto = Enclosing_Instance.offsetUpto + (Enclosing_Instance.vector.offsetUpto & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_NOT_MASK);
}
}
catch (System.Exception t)
{
throw new AbortException(t, Enclosing_Instance.Enclosing_Instance);
}
}
/// <summary>Called when postings hash is too small (> 50%
/// occupied) or too large (< 20% occupied).
/// </summary>
internal void RehashPostings(int newSize)
{
int newMask = newSize - 1;
Posting[] newHash = new Posting[newSize];
for (int i = 0; i < postingsHashSize; i++)
{
Posting p0 = postingsHash[i];
if (p0 != null)
{
int start = p0.textStart & Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_MASK;
char[] text = Enclosing_Instance.charPool.buffers[p0.textStart >> Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_SHIFT];
int pos = start;
while (text[pos] != 0xffff)
pos++;
int code = 0;
while (pos > start)
code = (code * 31) + text[--pos];
int hashPos = code & newMask;
System.Diagnostics.Debug.Assert(hashPos >= 0);
if (newHash[hashPos] != null)
{
int inc = ((code >> 8) + code) | 1;
do
{
code += inc;
hashPos = code & newMask;
}
while (newHash[hashPos] != null);
}
newHash[hashPos] = p0;
}
}
postingsHashMask = newMask;
postingsHash = newHash;
postingsHashSize = newSize;
postingsHashHalfSize = newSize >> 1;
}
internal ByteSliceReader vectorSliceReader = new ByteSliceReader();
/// <summary>Called once per field per document if term vectors
/// are enabled, to write the vectors to the
/// RAMOutputStream, which is then quickly flushed to
/// the real term vectors files in the Directory.
/// </summary>
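/// <remarks>For each field this writes, into tvfLocal: the number of
/// unique terms (VInt), a bits byte recording whether positions
/// and/or offsets are stored, and then one entry per term in sorted
/// order: shared-prefix length (VInt), suffix length (VInt), the
/// suffix chars, the term's freq (VInt), and finally the buffered
/// position and offset byte slices copied via the ByteSliceReader.
/// </remarks>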
internal void WriteVectors(FieldInfo fieldInfo)
{
System.Diagnostics.Debug.Assert(fieldInfo.storeTermVector);
Enclosing_Instance.vectorFieldNumbers[Enclosing_Instance.numVectorFields] = fieldInfo.number;
Enclosing_Instance.vectorFieldPointers[Enclosing_Instance.numVectorFields] = Enclosing_Instance.tvfLocal.GetFilePointer();
Enclosing_Instance.numVectorFields++;
int numPostingsVectors = postingsVectorsUpto;
Enclosing_Instance.tvfLocal.WriteVInt(numPostingsVectors);
byte bits = (byte) (0x0);
if (doVectorPositions)
bits |= TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR;
if (doVectorOffsets)
bits |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
Enclosing_Instance.tvfLocal.WriteByte(bits);
Enclosing_Instance.DoVectorSort(Enclosing_Instance.postingsVectors, numPostingsVectors);
Posting lastPosting = null;
ByteSliceReader reader = vectorSliceReader;
for (int j = 0; j < numPostingsVectors; j++)
{
PostingVector vector = Enclosing_Instance.postingsVectors[j];
Posting posting = vector.p;
int freq = posting.docFreq;
int prefix;
char[] text2 = Enclosing_Instance.charPool.buffers[posting.textStart >> Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_SHIFT];
int start2 = posting.textStart & Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_MASK;
int pos2 = start2;
// Compute common prefix between last term and
// this term
if (lastPosting == null)
prefix = 0;
else
{
char[] text1 = Enclosing_Instance.charPool.buffers[lastPosting.textStart >> Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_SHIFT];
int start1 = lastPosting.textStart & Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_MASK;
int pos1 = start1;
while (true)
{
char c1 = text1[pos1];
char c2 = text2[pos2];
if (c1 != c2 || c1 == 0xffff)
{
prefix = pos1 - start1;
break;
}
pos1++;
pos2++;
}
}
lastPosting = posting;
// Compute length
while (text2[pos2] != 0xffff)
pos2++;
int suffix = pos2 - start2 - prefix;
Enclosing_Instance.tvfLocal.WriteVInt(prefix);
Enclosing_Instance.tvfLocal.WriteVInt(suffix);
Enclosing_Instance.tvfLocal.WriteChars(text2, start2 + prefix, suffix);
Enclosing_Instance.tvfLocal.WriteVInt(freq);
if (doVectorPositions)
{
reader.Init(Enclosing_Instance.vectorsPool, vector.posStart, vector.posUpto);
reader.WriteTo(Enclosing_Instance.tvfLocal);
}
if (doVectorOffsets)
{
reader.Init(Enclosing_Instance.vectorsPool, vector.offsetStart, vector.offsetUpto);
reader.WriteTo(Enclosing_Instance.tvfLocal);
}
}
}
}
}
private static readonly byte defaultNorm;
/// <summary>Write norms in the "true" segment format. This is
/// called only during commit, to create the .nrm file.
/// </summary>
internal void WriteNorms(System.String segmentName, int totalNumDoc)
{
IndexOutput normsOut = directory.CreateOutput(segmentName + "." + IndexFileNames.NORMS_EXTENSION);
try
{
normsOut.WriteBytes(SegmentMerger.NORMS_HEADER, 0, SegmentMerger.NORMS_HEADER.Length);
int numField = fieldInfos.Size();
for (int fieldIdx = 0; fieldIdx < numField; fieldIdx++)
{
FieldInfo fi = fieldInfos.FieldInfo(fieldIdx);
if (fi.isIndexed && !fi.omitNorms)
{
BufferedNorms n = norms[fieldIdx];
long v;
if (n == null)
v = 0;
else
{
v = n.out_Renamed.GetFilePointer();
n.out_Renamed.WriteTo(normsOut);
n.Reset();
}
if (v < totalNumDoc)
FillBytes(normsOut, defaultNorm, (int) (totalNumDoc - v));
}
}
}
finally
{
normsOut.Close();
}
}
private DefaultSkipListWriter skipListWriter = null;
private bool currentFieldStorePayloads;
/// <summary>Creates a segment from all Postings in the Postings
/// hashes across all ThreadStates & FieldDatas.
/// </summary>
private System.Collections.IList WriteSegment()
{
System.Diagnostics.Debug.Assert(AllThreadsIdle());
System.Diagnostics.Debug.Assert(nextDocID == numDocsInRAM);
System.String segmentName;
segmentName = segment;
TermInfosWriter termsOut = new TermInfosWriter(directory, segmentName, fieldInfos, writer.GetTermIndexInterval());
IndexOutput freqOut = directory.CreateOutput(segmentName + ".frq");
IndexOutput proxOut = directory.CreateOutput(segmentName + ".prx");
// Gather all FieldData's that have postings, across all
// ThreadStates
System.Collections.ArrayList allFields = new System.Collections.ArrayList();
System.Diagnostics.Debug.Assert(AllThreadsIdle());
for (int i = 0; i < threadStates.Length; i++)
{
ThreadState state = threadStates[i];
state.TrimFields();
int numFields = state.numAllFieldData;
for (int j = 0; j < numFields; j++)
{
ThreadState.FieldData fp = state.allFieldDataArray[j];
if (fp.numPostings > 0)
allFields.Add(fp);
}
}
// Sort by field name
allFields.Sort();
int numAllFields = allFields.Count;
skipListWriter = new DefaultSkipListWriter(termsOut.skipInterval, termsOut.maxSkipLevels, numDocsInRAM, freqOut, proxOut);
int start = 0;
while (start < numAllFields)
{
System.String fieldName = ((ThreadState.FieldData) allFields[start]).fieldInfo.name;
int end = start + 1;
while (end < numAllFields && ((ThreadState.FieldData) allFields[end]).fieldInfo.name.Equals(fieldName))
end++;
ThreadState.FieldData[] fields = new ThreadState.FieldData[end - start];
for (int i = start; i < end; i++)
fields[i - start] = (ThreadState.FieldData) allFields[i];
// If this field has postings then add them to the
// segment
AppendPostings(fields, termsOut, freqOut, proxOut);
for (int i = 0; i < fields.Length; i++)
fields[i].ResetPostingArrays();
start = end;
}
freqOut.Close();
proxOut.Close();
termsOut.Close();
// Record all files we have flushed
System.Collections.IList flushedFiles = new System.Collections.ArrayList();
flushedFiles.Add(SegmentFileName(IndexFileNames.FIELD_INFOS_EXTENSION));
flushedFiles.Add(SegmentFileName(IndexFileNames.FREQ_EXTENSION));
flushedFiles.Add(SegmentFileName(IndexFileNames.PROX_EXTENSION));
flushedFiles.Add(SegmentFileName(IndexFileNames.TERMS_EXTENSION));
flushedFiles.Add(SegmentFileName(IndexFileNames.TERMS_INDEX_EXTENSION));
if (hasNorms)
{
WriteNorms(segmentName, numDocsInRAM);
flushedFiles.Add(SegmentFileName(IndexFileNames.NORMS_EXTENSION));
}
if (infoStream != null)
{
long newSegmentSize = SegmentSize(segmentName);
System.String message = String.Format(nf, " oldRAMSize={0:d} newFlushedSize={1:d} docs/MB={2:f} new/old={3:p}",
new Object[] { numBytesUsed, newSegmentSize, (numDocsInRAM / (newSegmentSize / 1024.0 / 1024.0)), (newSegmentSize / (double) numBytesUsed) });
infoStream.WriteLine(message);
}
ResetPostingsData();
nextDocID = 0;
nextWriteDocID = 0;
numDocsInRAM = 0;
files = null;
// Maybe downsize postingsFreeList array
if (postingsFreeList.Length > 1.5 * postingsFreeCount)
{
int newSize = postingsFreeList.Length;
while (newSize > 1.25 * postingsFreeCount)
{
newSize = (int) (newSize * 0.8);
}
Posting[] newArray = new Posting[newSize];
Array.Copy(postingsFreeList, 0, newArray, 0, postingsFreeCount);
postingsFreeList = newArray;
}
return flushedFiles;
}
/// <summary>Returns the name of the file with this extension, on
/// the current segment we are working on.
/// </summary>
private System.String SegmentFileName(System.String extension)
{
return segment + "." + extension;
}
private TermInfo termInfo = new TermInfo(); // minimize consing
/// <summary>Used to merge the postings from multiple ThreadStates
/// when creating a segment
/// </summary>
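/// <remarks>Each instance walks one ThreadState's sorted postings
/// for this field. NextTerm advances to the next unique term and
/// primes the freq/prox ByteSliceReaders; NextDoc decodes the in-RAM
/// freq stream, where each entry is (docDelta << 1) with the low bit
/// set when the term occurred exactly once in that doc (in which
/// case no explicit freq follows). The final doc for a term is not
/// in the byte stream; it is recovered from the Posting's lastDocID,
/// lastDocCode and docFreq fields.
/// </remarks>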
internal sealed class FieldMergeState
{
internal ThreadState.FieldData field;
internal Posting[] postings;
private Posting p;
internal char[] text;
internal int textOffset;
private int postingUpto = - 1;
private ByteSliceReader freq = new ByteSliceReader();
internal ByteSliceReader prox = new ByteSliceReader();
internal int docID;
internal int termFreq;
internal bool NextTerm()
{
postingUpto++;
if (postingUpto == field.numPostings)
return false;
p = postings[postingUpto];
docID = 0;
text = field.threadState.charPool.buffers[p.textStart >> Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_SHIFT];
textOffset = p.textStart & Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_MASK;
if (p.freqUpto > p.freqStart)
freq.Init(field.threadState.postingsPool, p.freqStart, p.freqUpto);
else
freq.bufferOffset = freq.upto = freq.endIndex = 0;
prox.Init(field.threadState.postingsPool, p.proxStart, p.proxUpto);
// Should always be true
bool result = NextDoc();
System.Diagnostics.Debug.Assert(result);
return true;
}
public bool NextDoc()
{
if (freq.bufferOffset + freq.upto == freq.endIndex)
{
if (p.lastDocCode != - 1)
{
// Return last doc
docID = p.lastDocID;
termFreq = p.docFreq;
p.lastDocCode = - 1;
return true;
}
// EOF
else
return false;
}
int code = freq.ReadVInt();
docID += SupportClass.Number.URShift(code, 1);
if ((code & 1) != 0)
termFreq = 1;
else
termFreq = freq.ReadVInt();
return true;
}
}
internal int CompareText(char[] text1, int pos1, char[] text2, int pos2)
{
while (true)
{
char c1 = text1[pos1++];
char c2 = text2[pos2++];
if (c1 < c2)
if (0xffff == c2)
return 1;
else
return - 1;
else if (c2 < c1)
if (0xffff == c1)
return - 1;
else
return 1;
else if (0xffff == c1)
return 0;
}
}
/* Walk through all unique text tokens (Posting
* instances) found in this field and serialize them
* into a single RAM segment. */
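/* Each element of fields[] is one ThreadState's FieldData for the
* same field name. Every FieldData is wrapped in a FieldMergeState
* and the states are merged term by term (CompareText picks the
* smallest term), then doc by doc, so that docIDs come out in order
* in the frq/prx files even though each thread buffered its postings
* independently. */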
internal void AppendPostings(ThreadState.FieldData[] fields, TermInfosWriter termsOut, IndexOutput freqOut, IndexOutput proxOut)
{
int fieldNumber = fields[0].fieldInfo.number;
int numFields = fields.Length;
FieldMergeState[] mergeStates = new FieldMergeState[numFields];
for (int i = 0; i < numFields; i++)
{
FieldMergeState fms = mergeStates[i] = new FieldMergeState();
fms.field = fields[i];
fms.postings = fms.field.SortPostings();
System.Diagnostics.Debug.Assert(fms.field.fieldInfo == fields [0].fieldInfo);
// Should always be true
bool result = fms.NextTerm();
System.Diagnostics.Debug.Assert(result);
}
int skipInterval = termsOut.skipInterval;
currentFieldStorePayloads = fields[0].fieldInfo.storePayloads;
FieldMergeState[] termStates = new FieldMergeState[numFields];
while (numFields > 0)
{
// Get the next term to merge
termStates[0] = mergeStates[0];
int numToMerge = 1;
for (int i = 1; i < numFields; i++)
{
char[] text = mergeStates[i].text;
int textOffset = mergeStates[i].textOffset;
int cmp = CompareText(text, textOffset, termStates[0].text, termStates[0].textOffset);
if (cmp < 0)
{
termStates[0] = mergeStates[i];
numToMerge = 1;
}
else if (cmp == 0)
termStates[numToMerge++] = mergeStates[i];
}
int df = 0;
int lastPayloadLength = - 1;
int lastDoc = 0;
char[] text2 = termStates[0].text;
int start = termStates[0].textOffset;
int pos = start;
while (text2[pos] != 0xffff)
pos++;
long freqPointer = freqOut.GetFilePointer();
long proxPointer = proxOut.GetFilePointer();
skipListWriter.ResetSkip();
// Now termStates has numToMerge FieldMergeStates
// which all share the same term. Now we must
// interleave the docID streams.
while (numToMerge > 0)
{
if ((++df % skipInterval) == 0)
{
skipListWriter.SetSkipData(lastDoc, currentFieldStorePayloads, lastPayloadLength);
skipListWriter.BufferSkip(df);
}
FieldMergeState minState = termStates[0];
for (int i = 1; i < numToMerge; i++)
if (termStates[i].docID < minState.docID)
minState = termStates[i];
int doc = minState.docID;
int termDocFreq = minState.termFreq;
System.Diagnostics.Debug.Assert(doc < numDocsInRAM);
System.Diagnostics.Debug.Assert(doc > lastDoc || df == 1);
int newDocCode = (doc - lastDoc) << 1;
lastDoc = doc;
ByteSliceReader prox = minState.prox;
// Carefully copy over the prox + payload info,
// changing the format to match Lucene's segment
// format.
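// In RAM each position entry is (positionDelta << 1), with the low
// bit set when a payload follows (then: payload length as a VInt,
// then the payload bytes). On disk the low bit instead means "the
// payload length changed", so the length is only re-written when it
// differs from lastPayloadLength; when this field stores no
// payloads, the bare position delta (code >> 1) is written.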
for (int j = 0; j < termDocFreq; j++)
{
int code = prox.ReadVInt();
if (currentFieldStorePayloads)
{
int payloadLength;
if ((code & 1) != 0)
{
// This position has a payload
payloadLength = prox.ReadVInt();
}
else
payloadLength = 0;
if (payloadLength != lastPayloadLength)
{
proxOut.WriteVInt(code | 1);
proxOut.WriteVInt(payloadLength);
lastPayloadLength = payloadLength;
}
else
proxOut.WriteVInt(code & (~ 1));
if (payloadLength > 0)
CopyBytes(prox, proxOut, payloadLength);
}
else
{
System.Diagnostics.Debug.Assert(0 ==(code & 1));
proxOut.WriteVInt(code >> 1);
}
}
if (1 == termDocFreq)
{
freqOut.WriteVInt(newDocCode | 1);
}
else
{
freqOut.WriteVInt(newDocCode);
freqOut.WriteVInt(termDocFreq);
}
if (!minState.NextDoc())
{
// Remove from termStates
int upto = 0;
for (int i = 0; i < numToMerge; i++)
if (termStates[i] != minState)
termStates[upto++] = termStates[i];
numToMerge--;
System.Diagnostics.Debug.Assert(upto == numToMerge);
// Advance this state to the next term
if (!minState.NextTerm())
{
// OK, no more terms, so remove from mergeStates
// as well
upto = 0;
for (int i = 0; i < numFields; i++)
if (mergeStates[i] != minState)
mergeStates[upto++] = mergeStates[i];
numFields--;
System.Diagnostics.Debug.Assert(upto == numFields);
}
}
}
System.Diagnostics.Debug.Assert(df > 0);
// Done merging this term
long skipPointer = skipListWriter.WriteSkip(freqOut);
// Write term
termInfo.Set(df, freqPointer, proxPointer, (int) (skipPointer - freqPointer));
termsOut.Add(fieldNumber, text2, start, pos - start, termInfo);
}
}
internal void Close()
{
lock (this)
{
closed = true;
System.Threading.Monitor.PulseAll(this);
}
}
/// <summary>Returns a free (idle) ThreadState that may be used for
/// indexing this one document. This call also pauses if a
/// flush is pending. If delTerm is non-null then we
/// buffer this deleted term after the thread state has
/// been acquired.
/// </summary>
internal ThreadState GetThreadState(Document doc, Term delTerm)
{
lock (this)
{
// First, find a thread state. If this thread already
// has affinity to a specific ThreadState, use that one
// again.
ThreadState state = (ThreadState) threadBindings[SupportClass.ThreadClass.Current()];
if (state == null)
{
// First time this thread has called us since last flush
ThreadState minThreadState = null;
for (int i = 0; i < threadStates.Length; i++)
{
ThreadState ts = threadStates[i];
if (minThreadState == null || ts.numThreads < minThreadState.numThreads)
minThreadState = ts;
}
if (minThreadState != null && (minThreadState.numThreads == 0 || threadStates.Length == MAX_THREAD_STATE))
{
state = minThreadState;
state.numThreads++;
}
else
{
// Just create a new "private" thread state
ThreadState[] newArray = new ThreadState[1 + threadStates.Length];
if (threadStates.Length > 0)
Array.Copy(threadStates, 0, newArray, 0, threadStates.Length);
state = newArray[threadStates.Length] = new ThreadState(this);
threadStates = newArray;
}
threadBindings[SupportClass.ThreadClass.Current()] = state;
}
// Next, wait until my thread state is idle (in case
// it's shared with other threads) and for threads to
// not be paused nor a flush pending:
while (!closed && (!state.isIdle || pauseThreads != 0 || flushPending || abortCount > 0))
try
{
System.Threading.Monitor.Wait(this);
}
catch (System.Threading.ThreadInterruptedException)
{
SupportClass.ThreadClass.Current().Interrupt();
}
if (closed)
throw new AlreadyClosedException("this IndexWriter is closed");
if (segment == null)
segment = writer.NewSegmentName();
numDocsInRAM++;
// We must at this point commit to flushing to ensure we
// always get N docs when we flush by doc count, even if
// > 1 thread is adding documents:
if (!flushPending && maxBufferedDocs != IndexWriter.DISABLE_AUTO_FLUSH && numDocsInRAM >= maxBufferedDocs)
{
flushPending = true;
state.doFlushAfter = true;
}
else
state.doFlushAfter = false;
state.isIdle = false;
try
{
bool success = false;
try
{
state.Init(doc, nextDocID);
if (delTerm != null)
{
AddDeleteTerm(delTerm, state.docID);
if (!state.doFlushAfter)
state.doFlushAfter = TimeToFlushDeletes();
}
// Only increment nextDocID on successful init
nextDocID++;
success = true;
}
finally
{
if (!success)
{
// Forcefully idle this ThreadState:
state.isIdle = true;
System.Threading.Monitor.PulseAll(this);
if (state.doFlushAfter)
{
state.doFlushAfter = false;
flushPending = false;
}
}
}
}
catch (AbortException ae)
{
Abort(ae);
}
return state;
}
}
/// <summary>Returns true if the caller (IndexWriter) should now
/// flush.
/// </summary>
internal bool AddDocument(Document doc, Analyzer analyzer)
{
return UpdateDocument(doc, analyzer, null);
}
internal bool UpdateDocument(Term t, Document doc, Analyzer analyzer)
{
return UpdateDocument(doc, analyzer, t);
}
internal bool UpdateDocument(Document doc, Analyzer analyzer, Term delTerm)
{
// This call is synchronized but fast
ThreadState state = GetThreadState(doc, delTerm);
try
{
bool success = false;
try
{
try
{
// This call is not synchronized and does all the work
state.ProcessDocument(analyzer);
}
finally
{
// This call is synchronized but fast
FinishDocument(state);
}
success = true;
}
finally
{
if (!success)
{
lock (this)
{
// Immediately mark this document as deleted
// since likely it was partially added. This
// keeps indexing as "all or none" (atomic) when
// adding a document:
AddDeleteDocID(state.docID);
}
}
}
}
catch (AbortException ae)
{
Abort(ae);
}
return state.doFlushAfter || TimeToFlushDeletes();
}
internal int GetNumBufferedDeleteTerms()
{
lock (this)
{
return numBufferedDeleteTerms;
}
}
internal System.Collections.Hashtable GetBufferedDeleteTerms()
{
lock (this)
{
return bufferedDeleteTerms;
}
}
internal System.Collections.IList GetBufferedDeleteDocIDs()
{
lock (this)
{
return bufferedDeleteDocIDs;
}
}
// Reset buffered deletes.
internal void ClearBufferedDeletes()
{
lock (this)
{
bufferedDeleteTerms.Clear();
bufferedDeleteDocIDs.Clear();
numBufferedDeleteTerms = 0;
if (numBytesUsed > 0)
ResetPostingsData();
}
}
internal bool BufferDeleteTerms(Term[] terms)
{
lock (this)
{
while (pauseThreads != 0 || flushPending)
try
{
System.Threading.Monitor.Wait(this);
}
catch (System.Threading.ThreadInterruptedException)
{
SupportClass.ThreadClass.Current().Interrupt();
}
for (int i = 0; i < terms.Length; i++)
AddDeleteTerm(terms[i], numDocsInRAM);
return TimeToFlushDeletes();
}
}
internal bool BufferDeleteTerm(Term term)
{
lock (this)
{
while (pauseThreads != 0 || flushPending)
try
{
System.Threading.Monitor.Wait(this);
}
catch (System.Threading.ThreadInterruptedException)
{
SupportClass.ThreadClass.Current().Interrupt();
}
AddDeleteTerm(term, numDocsInRAM);
return TimeToFlushDeletes();
}
}
private bool TimeToFlushDeletes()
{
lock (this)
{
return (bufferIsFull || (maxBufferedDeleteTerms != IndexWriter.DISABLE_AUTO_FLUSH && numBufferedDeleteTerms >= maxBufferedDeleteTerms)) && SetFlushPending();
}
}
internal void SetMaxBufferedDeleteTerms(int maxBufferedDeleteTerms)
{
this.maxBufferedDeleteTerms = maxBufferedDeleteTerms;
}
internal int GetMaxBufferedDeleteTerms()
{
return maxBufferedDeleteTerms;
}
internal bool HasDeletes()
{
lock (this)
{
return bufferedDeleteTerms.Count > 0 || bufferedDeleteDocIDs.Count > 0;
}
}
// Number of documents a delete term applies to.
internal class Num
{
private int num;
internal Num(int num)
{
this.num = num;
}
internal int GetNum()
{
return num;
}
internal void SetNum(int num)
{
// Only record the new number if it's greater than the
// current one. This is important because if multiple
// threads are replacing the same doc at nearly the
// same time, it's possible that one thread that got a
// higher docID is scheduled before the other
// threads.
if (num > this.num)
this.num = num;
}
}
// Buffer a term in bufferedDeleteTerms, which records the
// current number of documents buffered in RAM so that the
// delete term will be applied to those documents as well
// as the disk segments.
private void AddDeleteTerm(Term term, int docCount)
{
lock (this)
{
Num num = (Num) bufferedDeleteTerms[term];
if (num == null)
{
bufferedDeleteTerms[term] = new Num(docCount);
// This is coarse approximation of actual bytes used:
numBytesUsed += (term.Field().Length + term.Text().Length) * BYTES_PER_CHAR + 4 + 5 * OBJECT_HEADER_BYTES + 5 * OBJECT_POINTER_BYTES;
if (ramBufferSize != IndexWriter.DISABLE_AUTO_FLUSH && numBytesUsed > ramBufferSize)
{
bufferIsFull = true;
}
}
else
{
num.SetNum(docCount);
}
numBufferedDeleteTerms++;
}
}
// Buffer a specific docID for deletion. Currently only
// used when we hit an exception when adding a document
private void AddDeleteDocID(int docId)
{
lock (this)
{
bufferedDeleteDocIDs.Add((System.Int32) docId);
numBytesUsed += OBJECT_HEADER_BYTES + BYTES_PER_INT + OBJECT_POINTER_BYTES;
}
}
/// <summary>Does the synchronized work to finish/flush the
/// inverted document.
/// </summary>
private void FinishDocument(ThreadState state)
{
lock (this)
{
if (abortCount > 0)
{
// Forcefully idle this threadstate -- its state will
// be reset by abort()
state.isIdle = true;
System.Threading.Monitor.PulseAll(this);
return ;
}
// Now write the indexed document to the real files.
if (nextWriteDocID == state.docID)
{
// It's my turn, so write everything now:
nextWriteDocID++;
state.WriteDocument();
state.isIdle = true;
System.Threading.Monitor.PulseAll(this);
// If any states were waiting on me, sweep through and
// flush those that are enabled by my write.
if (numWaiting > 0)
{
bool any = true;
while (any)
{
any = false;
for (int i = 0; i < numWaiting; )
{
ThreadState s = waitingThreadStates[i];
if (s.docID == nextWriteDocID)
{
s.WriteDocument();
s.isIdle = true;
nextWriteDocID++;
any = true;
if (numWaiting > i + 1)
// Swap in the last waiting state to fill in
// the hole we just created. It's important
// to do this as-we-go and not at the end of
// the loop, because if we hit an aborting
// exception in one of the s.writeDocument
// calls (above), it leaves this array in an
// inconsistent state:
waitingThreadStates[i] = waitingThreadStates[numWaiting - 1];
numWaiting--;
}
else
{
System.Diagnostics.Debug.Assert(!s.isIdle);
i++;
}
}
}
}
}
else
{
// Another thread got a docID before me, but it
// hasn't finished its processing. So add myself to
// the line but don't hold up this thread.
waitingThreadStates[numWaiting++] = state;
}
}
}
internal long GetRAMUsed()
{
return numBytesUsed;
}
internal long numBytesAlloc;
internal long numBytesUsed;
internal System.Globalization.NumberFormatInfo nf = System.Globalization.CultureInfo.CurrentCulture.NumberFormat;
/* Used only when writing norms, to fill the default norm
* value into the holes in the docID stream for those docs
* that didn't have this field. */
internal static void FillBytes(IndexOutput out_Renamed, byte b, int numBytes)
{
for (int i = 0; i < numBytes; i++)
out_Renamed.WriteByte(b);
}
internal byte[] copyByteBuffer = new byte[4096];
/// <summary>Copy numBytes from srcIn to destIn </summary>
internal void CopyBytes(IndexInput srcIn, IndexOutput destIn, long numBytes)
{
// TODO: we could do this more efficiently (save a copy)
// because it's always from a ByteSliceReader ->
// IndexOutput
while (numBytes > 0)
{
int chunk;
if (numBytes > 4096)
chunk = 4096;
else
chunk = (int) numBytes;
srcIn.ReadBytes(copyByteBuffer, 0, chunk);
destIn.WriteBytes(copyByteBuffer, 0, chunk);
numBytes -= chunk;
}
}
/* Stores norms, buffered in RAM, until they are flushed
* to a partial segment. */
private class BufferedNorms
{
internal RAMOutputStream out_Renamed;
internal int upto;
internal BufferedNorms()
{
out_Renamed = new RAMOutputStream();
}
internal void Add(float norm)
{
byte b = Similarity.EncodeNorm(norm);
out_Renamed.WriteByte(b);
upto++;
}
internal void Reset()
{
out_Renamed.Reset();
upto = 0;
}
internal void Fill(int docID)
{
// Must now fill in docs that didn't have this
// field. Note that this is how norms can consume
// tremendous storage when the docs have widely
// varying fields, because we are not
// storing the norms sparsely (see LUCENE-830)
if (upto < docID)
{
Lucene.Net.Index.DocumentsWriter.FillBytes(out_Renamed, Lucene.Net.Index.DocumentsWriter.defaultNorm, docID - upto);
upto = docID;
}
}
}
/* Simple StringReader that can be reset to a new string;
* we use this when tokenizing the string value from a
* Field. */
sealed internal class ReusableStringReader : System.IO.StringReader
{
internal ReusableStringReader() : base("")
{
}
internal int upto;
internal int left;
internal System.String s;
internal void Init(System.String s)
{
this.s = s;
left = s.Length;
this.upto = 0;
}
public int Read(char[] c)
{
return Read(c, 0, c.Length);
}
public override int Read(System.Char[] c, int off, int len)
{
if (left > len)
{
DocumentsWriter.GetCharsFromString(s, upto, upto + len, c, off);
upto += len;
left -= len;
return len;
}
else if (0 == left)
{
return - 1;
}
else
{
DocumentsWriter.GetCharsFromString(s, upto, upto + left, c, off);
int r = left;
left = 0;
upto = s.Length;
return r;
}
}
public override void Close()
{
}
}
/* IndexInput that knows how to read the byte slices written
* by Posting and PostingVector. We read the bytes in
* each slice until we hit the end of that slice, at which
* point we read the forwarding address of the next slice
* and then jump to it.*/
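/* The forwarding address is the absolute byte offset into the pool,
* stored big-endian in the last 4 bytes of every non-final slice;
* NextSlice reads it, advances its own level counter through
* nextLevelArray to learn the next slice's length, and continues
* reading from the new location. */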
sealed internal class ByteSliceReader:IndexInput
{
internal ByteBlockPool pool;
internal int bufferUpto;
internal byte[] buffer;
public int upto;
internal int limit;
internal int level;
public int bufferOffset;
public int endIndex;
public void Init(ByteBlockPool pool, int startIndex, int endIndex)
{
System.Diagnostics.Debug.Assert(endIndex - startIndex > 0);
this.pool = pool;
this.endIndex = endIndex;
level = 0;
bufferUpto = startIndex / Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_SIZE;
bufferOffset = bufferUpto * Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_SIZE;
buffer = pool.buffers[bufferUpto];
upto = startIndex & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_MASK;
int firstSize = Lucene.Net.Index.DocumentsWriter.levelSizeArray[0];
if (startIndex + firstSize >= endIndex)
{
// There is only this one slice to read
limit = endIndex & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_MASK;
}
else
limit = upto + firstSize - 4;
}
public override byte ReadByte()
{
// Assert that we are not at EOF
System.Diagnostics.Debug.Assert(upto + bufferOffset < endIndex);
if (upto == limit)
NextSlice();
return buffer[upto++];
}
public long WriteTo(IndexOutput out_Renamed)
{
long size = 0;
while (true)
{
if (limit + bufferOffset == endIndex)
{
System.Diagnostics.Debug.Assert(endIndex - bufferOffset >= upto);
out_Renamed.WriteBytes(buffer, upto, limit - upto);
size += limit - upto;
break;
}
else
{
out_Renamed.WriteBytes(buffer, upto, limit - upto);
size += limit - upto;
NextSlice();
}
}
return size;
}
public void NextSlice()
{
// Skip to our next slice
int nextIndex = ((buffer[limit] & 0xff) << 24) + ((buffer[1 + limit] & 0xff) << 16) + ((buffer[2 + limit] & 0xff) << 8) + (buffer[3 + limit] & 0xff);
level = Lucene.Net.Index.DocumentsWriter.nextLevelArray[level];
int newSize = Lucene.Net.Index.DocumentsWriter.levelSizeArray[level];
bufferUpto = nextIndex / Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_SIZE;
bufferOffset = bufferUpto * Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_SIZE;
buffer = pool.buffers[bufferUpto];
upto = nextIndex & Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_MASK;
if (nextIndex + newSize >= endIndex)
{
// We are advancing to the final slice
System.Diagnostics.Debug.Assert(endIndex - nextIndex > 0);
limit = endIndex - bufferOffset;
}
else
{
// This is not the final slice (subtract 4 for the
// forwarding address at the end of this new slice)
limit = upto + newSize - 4;
}
}
public override void ReadBytes(byte[] b, int offset, int len)
{
while (len > 0)
{
int numLeft = limit - upto;
if (numLeft < len)
{
// Read entire slice
Array.Copy(buffer, upto, b, offset, numLeft);
offset += numLeft;
len -= numLeft;
NextSlice();
}
else
{
// This slice is the last one
Array.Copy(buffer, upto, b, offset, len);
upto += len;
break;
}
}
}
public override long GetFilePointer()
{
throw new System.SystemException("not implemented");
}
public override long Length()
{
throw new System.SystemException("not implemented");
}
public override void Seek(long pos)
{
throw new System.SystemException("not implemented");
}
public override void Close()
{
throw new System.SystemException("not implemented");
}
override public System.Object Clone()
{
return null;
}
}
// Size of each slice. These arrays should be at most 16
// elements. First array is just a compact way to encode
// X+1 with a max. Second array is the length of each
// slice, ie first slice is 5 bytes, next slice is 14
// bytes, etc.
internal static readonly int[] nextLevelArray = new int[]{1, 2, 3, 4, 5, 6, 7, 8, 9, 9};
internal static readonly int[] levelSizeArray = new int[]{5, 14, 20, 30, 40, 40, 80, 80, 120, 200};
/* Class that Posting and PostingVector use to write byte
* streams into shared fixed-size byte[] arrays. The idea
* is to allocate slices of increasing lengths. For
* example, the first slice is 5 bytes, the next slice is
* 14, etc. We start by writing our bytes into the first
* 5 bytes. When we hit the end of the slice, we allocate
* the next slice and then write the address of the new
* slice into the last 4 bytes of the previous slice (the
* "forwarding address").
*
* Each slice is filled with 0's initially, and we mark
* the end with a non-zero byte. This way the methods
* that are writing into the slice don't need to record
* its length and instead allocate a new slice once they
* hit a non-zero byte. */
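/* For example, a term's freq stream starts in the smallest (5 byte)
* slice: 4 writable bytes plus the non-zero end marker. A writer
* appends bytes one at a time and, when it lands on the end marker,
* asks the pool for the next, larger slice; AllocSlice copies the 3
* bytes it is about to clobber into the new slice and overwrites
* them, plus the marker byte, with the forwarding address. A minimal
* sketch of that write loop (the names 'pool', 'buffer' and 'upto'
* here are illustrative caller-side locals, not members of this
* class):
*
* if (buffer[upto] != 0) { // hit the slice's end marker
* upto = pool.AllocSlice(buffer, upto);
* buffer = pool.buffer; // new slice lives in the pool's head buffer
* }
* buffer[upto++] = b;
*/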
sealed internal class ByteBlockPool
{
public ByteBlockPool(DocumentsWriter enclosingInstance)
{
InitBlock(enclosingInstance);
}
private void InitBlock(DocumentsWriter enclosingInstance)
{
this.enclosingInstance = enclosingInstance;
byteUpto = Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_SIZE;
}
private DocumentsWriter enclosingInstance;
public DocumentsWriter Enclosing_Instance
{
get
{
return enclosingInstance;
}
}
public byte[][] buffers = new byte[10][];
internal int bufferUpto = - 1; // Which buffer we are upto
public int byteUpto; // Where we are in head buffer
public byte[] buffer; // Current head buffer
public int byteOffset = - Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_SIZE; // Current head offset
public void Reset()
{
if (bufferUpto != - 1)
{
// We allocated at least one buffer
for (int i = 0; i < bufferUpto; i++)
// Fully zero fill buffers that we fully used
Array.Clear(buffers[i], 0, buffers[i].Length);
// Partial zero fill the final buffer
Array.Clear(buffers[bufferUpto], 0, byteUpto);
if (bufferUpto > 0)
// Recycle all but the first buffer
Enclosing_Instance.RecycleByteBlocks(buffers, 1, 1 + bufferUpto);
// Re-use the first buffer
bufferUpto = 0;
byteUpto = 0;
byteOffset = 0;
buffer = buffers[0];
}
}
public void NextBuffer()
{
if (1 + bufferUpto == buffers.Length)
{
byte[][] newBuffers = new byte[(int) (buffers.Length * 1.5)][];
Array.Copy(buffers, 0, newBuffers, 0, buffers.Length);
buffers = newBuffers;
}
buffer = buffers[1 + bufferUpto] = Enclosing_Instance.GetByteBlock();
bufferUpto++;
byteUpto = 0;
byteOffset += Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_SIZE;
}
public int NewSlice(int size)
{
if (byteUpto > Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_SIZE - size)
NextBuffer();
int upto = byteUpto;
byteUpto += size;
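// Non-zero end marker for the new slice; its low 4 bits (0) encode level 0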
buffer[byteUpto - 1] = 16;
return upto;
}
public int AllocSlice(byte[] slice, int upto)
{
int level = slice[upto] & 15;
int newLevel = Lucene.Net.Index.DocumentsWriter.nextLevelArray[level];
int newSize = Lucene.Net.Index.DocumentsWriter.levelSizeArray[newLevel];
// Maybe allocate another block
if (byteUpto > Lucene.Net.Index.DocumentsWriter.BYTE_BLOCK_SIZE - newSize)
NextBuffer();
int newUpto = byteUpto;
int offset = newUpto + byteOffset;
byteUpto += newSize;
// Copy forward the past 3 bytes (which we are about
// to overwrite with the forwarding address):
buffer[newUpto] = slice[upto - 3];
buffer[newUpto + 1] = slice[upto - 2];
buffer[newUpto + 2] = slice[upto - 1];
// Write forwarding address at end of last slice:
slice[upto - 3] = (byte)(SupportClass.Number.URShift(offset, 24));
slice[upto - 2] = (byte)(SupportClass.Number.URShift(offset, 16));
slice[upto - 1] = (byte)(SupportClass.Number.URShift(offset, 8));
slice[upto] = (byte)offset;
// Write new level:
buffer[byteUpto - 1] = (byte)(16 | newLevel);
return newUpto + 3;
}
}
sealed internal class CharBlockPool
{
public CharBlockPool(DocumentsWriter enclosingInstance)
{
InitBlock(enclosingInstance);
}
private void InitBlock(DocumentsWriter enclosingInstance)
{
this.enclosingInstance = enclosingInstance;
byteUpto = Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_SIZE;
}
private DocumentsWriter enclosingInstance;
public DocumentsWriter Enclosing_Instance
{
get
{
return enclosingInstance;
}
}
public char[][] buffers = new char[10][];
internal int bufferUpto = - 1; // Which buffer we are upto
public int byteUpto; // Where we are in head buffer
public char[] buffer; // Current head buffer
public int byteOffset = - Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_SIZE; // Current head offset
public void Reset()
{
Enclosing_Instance.RecycleCharBlocks(buffers, 1 + bufferUpto);
bufferUpto = - 1;
byteUpto = Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_SIZE;
byteOffset = - Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_SIZE;
}
public void NextBuffer()
{
if (1 + bufferUpto == buffers.Length)
{
char[][] newBuffers = new char[(int) (buffers.Length * 1.5)][];
Array.Copy(buffers, 0, newBuffers, 0, buffers.Length);
buffers = newBuffers;
}
buffer = buffers[1 + bufferUpto] = Enclosing_Instance.GetCharBlock();
bufferUpto++;
byteUpto = 0;
byteOffset += Lucene.Net.Index.DocumentsWriter.CHAR_BLOCK_SIZE;
}
}
// Used only when infoStream != null
private long SegmentSize(System.String segmentName)
{
System.Diagnostics.Debug.Assert(infoStream != null);
long size = directory.FileLength(segmentName + ".tii") + directory.FileLength(segmentName + ".tis") + directory.FileLength(segmentName + ".frq") + directory.FileLength(segmentName + ".prx");
System.String normFileName = segmentName + ".nrm";
if (directory.FileExists(normFileName))
size += directory.FileLength(normFileName);
return size;
}
private const int POINTER_NUM_BYTE = 4;
private const int INT_NUM_BYTE = 4;
private const int CHAR_NUM_BYTE = 2;
// Why + 5*POINTER_NUM_BYTE below?
// 1: Posting has "vector" field which is a pointer
// 2: Posting is referenced by postingsFreeList array
// 3,4,5: Posting is referenced by postings hash, which
// targets 25-50% fill factor; approximate this
// as 3X # pointers
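// The 9 * INT_NUM_BYTE term covers the nine int fields of Posting
// (textStart, docFreq, freqStart, freqUpto, proxStart, proxUpto,
// lastDocID, lastDocCode, lastPosition).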
internal static readonly int POSTING_NUM_BYTE = OBJECT_HEADER_BYTES + 9 * INT_NUM_BYTE + 5 * POINTER_NUM_BYTE;
// Holds free pool of Posting instances
private Posting[] postingsFreeList;
private int postingsFreeCount;
private int postingsAllocCount;
/* Allocate more Postings from shared pool */
internal void GetPostings(Posting[] postings)
{
lock (this)
{
numBytesUsed += postings.Length * POSTING_NUM_BYTE;
int numToCopy;
if (postingsFreeCount < postings.Length)
numToCopy = postingsFreeCount;
else
numToCopy = postings.Length;
int start = postingsFreeCount - numToCopy;
Array.Copy(postingsFreeList, start, postings, 0, numToCopy);
postingsFreeCount -= numToCopy;
// Directly allocate the remainder if any
if (numToCopy < postings.Length)
{
int extra = postings.Length - numToCopy;
int newPostingsAllocCount = postingsAllocCount + extra;
if (newPostingsAllocCount > postingsFreeList.Length)
{
postingsFreeList = new Posting[(int) (1.25 * newPostingsAllocCount)];
}
BalanceRAM();
for (int i = numToCopy; i < postings.Length; i++)
{
postings[i] = new Posting();
numBytesAlloc += POSTING_NUM_BYTE;
postingsAllocCount++;
}
}
}
}
internal void RecyclePostings(Posting[] postings, int numPostings)
{
lock (this)
{
// Move all Postings from this ThreadState back to our
// free list. We pre-allocated this array while we were
// creating Postings to make sure it's large enough
System.Diagnostics.Debug.Assert(postingsFreeCount + numPostings <= postingsFreeList.Length);
Array.Copy(postings, 0, postingsFreeList, postingsFreeCount, numPostings);
postingsFreeCount += numPostings;
}
}
/* Initial chunk size of the shared byte[] blocks used to
store postings data */
internal const int BYTE_BLOCK_SHIFT = 15;
internal static readonly int BYTE_BLOCK_SIZE = (int) System.Math.Pow(2.0, BYTE_BLOCK_SHIFT);
internal static readonly int BYTE_BLOCK_MASK = BYTE_BLOCK_SIZE - 1;
internal static readonly int BYTE_BLOCK_NOT_MASK = ~ BYTE_BLOCK_MASK;
private System.Collections.ArrayList freeByteBlocks = new System.Collections.ArrayList();
/* Allocate another byte[] from the shared pool */
internal byte[] GetByteBlock()
{
lock (this)
{
int size = freeByteBlocks.Count;
byte[] b;
if (0 == size)
{
numBytesAlloc += BYTE_BLOCK_SIZE;
BalanceRAM();
b = new byte[BYTE_BLOCK_SIZE];
}
else
{
System.Object tempObject;
tempObject = freeByteBlocks[size - 1];
freeByteBlocks.RemoveAt(size - 1);
b = (byte[]) tempObject;
}
numBytesUsed += BYTE_BLOCK_SIZE;
return b;
}
}
/* Return a byte[] to the pool */
internal void RecycleByteBlocks(byte[][] blocks, int start, int end)
{
lock (this)
{
for (int i = start; i < end; i++)
freeByteBlocks.Add(blocks[i]);
}
}
/* Initial chunk size of the shared char[] blocks used to
store term text */
internal const int CHAR_BLOCK_SHIFT = 14;
internal static readonly int CHAR_BLOCK_SIZE = (int) System.Math.Pow(2.0, CHAR_BLOCK_SHIFT);
internal static readonly int CHAR_BLOCK_MASK = CHAR_BLOCK_SIZE - 1;
internal static readonly int MAX_TERM_LENGTH = CHAR_BLOCK_SIZE - 1;
private System.Collections.ArrayList freeCharBlocks = new System.Collections.ArrayList();
/* Allocate another char[] from the shared pool */
internal char[] GetCharBlock()
{
lock (this)
{
int size = freeCharBlocks.Count;
char[] c;
if (0 == size)
{
numBytesAlloc += CHAR_BLOCK_SIZE * CHAR_NUM_BYTE;
BalanceRAM();
c = new char[CHAR_BLOCK_SIZE];
}
else
{
System.Object tempObject;
tempObject = freeCharBlocks[size - 1];
freeCharBlocks.RemoveAt(size - 1);
c = (char[]) tempObject;
}
numBytesUsed += CHAR_BLOCK_SIZE * CHAR_NUM_BYTE;
return c;
}
}
/* Return a char[] to the pool */
internal void RecycleCharBlocks(char[][] blocks, int numBlocks)
{
lock (this)
{
for (int i = 0; i < numBlocks; i++)
freeCharBlocks.Add(blocks[i]);
}
}
internal System.String ToMB(long v)
{
return String.Format(nf, "{0:f}", new Object[] { (v / 1024.0 / 1024.0) });
}
/* We have three pools of RAM: Postings, byte blocks
* (holds freq/prox posting data) and char blocks (holds
* characters in the term). Different docs require
* varying amount of storage from these three classes.
* For example, docs with many unique single-occurrence
* short terms will use up the Postings RAM and hardly any
* of the other two. Whereas docs with very large terms
* will use a lot of char blocks RAM and relatively less of
* the other two. This method just frees allocations from
* the pools once we are over-budget, which balances the
* pools to match the current docs. */
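/* Concretely: once numBytesAlloc exceeds 105% of ramBufferSize we
* free items round-robin from the three pools (byte blocks, char
* blocks, then pooled Postings) until allocation falls back to 95%,
* or until nothing is left to free and bufferIsFull must be set.
* If allocation is still under that trigger but actual usage has
* passed ramBufferSize, we simply flag a flush instead of freeing. */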
private void BalanceRAM()
{
lock (this)
{
if (ramBufferSize == IndexWriter.DISABLE_AUTO_FLUSH || bufferIsFull)
return ;
// We free our allocations if we've allocated 5% over
// our allowed RAM buffer
long freeTrigger = (long) (1.05 * ramBufferSize);
long freeLevel = (long) (0.95 * ramBufferSize);
// We flush when we've used our target usage
long flushTrigger = (long) ramBufferSize;
if (numBytesAlloc > freeTrigger)
{
if (infoStream != null)
infoStream.WriteLine(" RAM: now balance allocations: usedMB=" + ToMB(numBytesUsed) + " vs trigger=" + ToMB(flushTrigger) + " allocMB=" + ToMB(numBytesAlloc) + " vs trigger=" + ToMB(freeTrigger) + " postingsFree=" + ToMB(postingsFreeCount * POSTING_NUM_BYTE) + " byteBlockFree=" + ToMB(freeByteBlocks.Count * BYTE_BLOCK_SIZE) + " charBlockFree=" + ToMB(freeCharBlocks.Count * CHAR_BLOCK_SIZE * CHAR_NUM_BYTE));
// When we've crossed 100% of our target Postings
// RAM usage, try to free up until we're back down
// to 95%
long startBytesAlloc = numBytesAlloc;
int postingsFreeChunk = (int) (BYTE_BLOCK_SIZE / POSTING_NUM_BYTE);
int iter = 0;
// We free equally from each pool in 64 KB
// chunks until we are below our threshold
// (freeLevel)
while (numBytesAlloc > freeLevel)
{
if (0 == freeByteBlocks.Count && 0 == freeCharBlocks.Count && 0 == postingsFreeCount)
{
// Nothing else to free -- must flush now.
bufferIsFull = true;
if (infoStream != null)
infoStream.WriteLine(" nothing to free; now set bufferIsFull");
break;
}
if ((0 == iter % 3) && freeByteBlocks.Count > 0)
{
freeByteBlocks.RemoveAt(freeByteBlocks.Count - 1);
numBytesAlloc -= BYTE_BLOCK_SIZE;
}
if ((1 == iter % 3) && freeCharBlocks.Count > 0)
{
freeCharBlocks.RemoveAt(freeCharBlocks.Count - 1);
numBytesAlloc -= CHAR_BLOCK_SIZE * CHAR_NUM_BYTE;
}
if ((2 == iter % 3) && postingsFreeCount > 0)
{
int numToFree;
if (postingsFreeCount >= postingsFreeChunk)
numToFree = postingsFreeChunk;
else
numToFree = postingsFreeCount;
Array.Clear(postingsFreeList, postingsFreeCount - numToFree, numToFree);
postingsFreeCount -= numToFree;
postingsAllocCount -= numToFree;
numBytesAlloc -= numToFree * POSTING_NUM_BYTE;
}
iter++;
}
if (infoStream != null)
infoStream.WriteLine(String.Format(" after free: freedMB={0:f} usedMB={1:f} allocMB={2:f}",
new Object[] { ((startBytesAlloc - numBytesAlloc) / 1024.0 / 1024.0), (numBytesUsed / 1024.0 / 1024.0), (numBytesAlloc / 1024.0 / 1024.0) }));
}
else
{
// If we have not crossed the 100% mark, but have
// crossed the 95% mark of RAM we are actually
// using, go ahead and flush. This prevents
// over-allocating and then freeing, with every
// flush.
if (numBytesUsed > flushTrigger)
{
if (infoStream != null)
infoStream.WriteLine(String.Format(nf, " RAM: now flush @ usedMB={0:f} allocMB={1:f} triggerMB={2:f}",
new Object[] { (numBytesUsed / 1024.0 / 1024.0), (numBytesAlloc / 1024.0 / 1024.0), (flushTrigger / 1024.0 / 1024.0) }));
bufferIsFull = true;
}
}
}
}
/* Used to track postings for a single term. One of these
* exists per unique term seen since the last flush. */
sealed internal class Posting
{
internal int textStart; // Address into char[] blocks where our text is stored
internal int docFreq; // # times this term occurs in the current doc
internal int freqStart; // Address of first byte[] slice for freq
internal int freqUpto; // Next write address for freq
internal int proxStart; // Address of first byte[] slice
internal int proxUpto; // Next write address for prox
internal int lastDocID; // Last docID where this term occurred
internal int lastDocCode; // Code for prior doc
internal int lastPosition; // Last position where this term occurred
internal PostingVector vector; // Corresponding PostingVector instance
}
/* Used to track data for term vectors. One of these
* exists per unique term seen in each field in the
* document. */
sealed internal class PostingVector
{
internal Posting p; // Corresponding Posting instance for this term
internal int lastOffset; // Last offset we saw
internal int offsetStart; // Address of first slice for offsets
internal int offsetUpto; // Next write address for offsets
internal int posStart; // Address of first slice for positions
internal int posUpto; // Next write address for positions
}
static DocumentsWriter()
{
defaultNorm = Similarity.EncodeNorm(1.0f);
}
/// <summary>
/// Copies an array of chars obtained from a String into a specified array of chars
/// </summary>
/// <param name="sourceString">The String to get the chars from</param>
/// <param name="sourceStart">Position of the String to start getting the chars</param>
/// <param name="sourceEnd">Position of the String to end getting the chars</param>
/// <param name="destinationArray">Array to return the chars</param>
/// <param name="destinationStart">Position of the destination array of chars to start storing the chars</param>
/// <returns>An array of chars</returns>
static internal void GetCharsFromString(System.String sourceString, int sourceStart, int sourceEnd, char[] destinationArray, int destinationStart)
{
int sourceCounter;
int destinationCounter;
sourceCounter = sourceStart;
destinationCounter = destinationStart;
while (sourceCounter < sourceEnd)
{
destinationArray[destinationCounter] = (char)sourceString[sourceCounter];
sourceCounter++;
destinationCounter++;
}
}
}
// Used only internally to DW to call abort "up the stack"
[Serializable]
class AbortException : System.IO.IOException
{
public AbortException(System.Exception cause, DocumentsWriter docWriter) : base("", cause)
{
docWriter.SetAborting();
}
}
}