Merge "Enable public access to SimilarityIndex scoring function"
This commit is contained in:
commit
2ad2d85bcd
|
@ -63,10 +63,13 @@
|
||||||
* will not exceed 1 MiB per instance. The index starts out at a smaller size
|
* will not exceed 1 MiB per instance. The index starts out at a smaller size
|
||||||
* (closer to 2 KiB), but may grow as more distinct blocks within the scanned
|
* (closer to 2 KiB), but may grow as more distinct blocks within the scanned
|
||||||
* file are discovered.
|
* file are discovered.
|
||||||
|
*
|
||||||
|
* @since 4.0
|
||||||
*/
|
*/
|
||||||
class SimilarityIndex {
|
public class SimilarityIndex {
|
||||||
/** A special {@link TableFullException} used in place of OutOfMemoryError. */
|
/** A special {@link TableFullException} used in place of OutOfMemoryError. */
|
||||||
private static final TableFullException TABLE_FULL_OUT_OF_MEMORY = new TableFullException();
|
public static final TableFullException
|
||||||
|
TABLE_FULL_OUT_OF_MEMORY = new TableFullException();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Shift to apply before storing a key.
|
* Shift to apply before storing a key.
|
||||||
|
@ -105,6 +108,26 @@ class SimilarityIndex {
|
||||||
/** {@code idHash.length == 1 << idHashBits}. */
|
/** {@code idHash.length == 1 << idHashBits}. */
|
||||||
private int idHashBits;
|
private int idHashBits;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new similarity index for the given object
|
||||||
|
*
|
||||||
|
* @param obj
|
||||||
|
* the object to hash
|
||||||
|
* @return similarity index for this object
|
||||||
|
* @throws IOException
|
||||||
|
* file contents cannot be read from the repository.
|
||||||
|
* @throws TableFullException
|
||||||
|
* object hashing overflowed the storage capacity of the
|
||||||
|
* SimilarityIndex.
|
||||||
|
*/
|
||||||
|
public static SimilarityIndex create(ObjectLoader obj) throws IOException,
|
||||||
|
TableFullException {
|
||||||
|
SimilarityIndex idx = new SimilarityIndex();
|
||||||
|
idx.hash(obj);
|
||||||
|
idx.sort();
|
||||||
|
return idx;
|
||||||
|
}
|
||||||
|
|
||||||
SimilarityIndex() {
|
SimilarityIndex() {
|
||||||
idHashBits = 8;
|
idHashBits = 8;
|
||||||
idHash = new long[1 << idHashBits];
|
idHash = new long[1 << idHashBits];
|
||||||
|
@ -212,7 +235,27 @@ void sort() {
|
||||||
Arrays.sort(idHash);
|
Arrays.sort(idHash);
|
||||||
}
|
}
|
||||||
|
|
||||||
int score(SimilarityIndex dst, int maxScore) {
|
/**
|
||||||
|
* Compute the similarity score between this index and another.
|
||||||
|
* <p>
|
||||||
|
* A region of a file is defined as a line in a text file or a fixed-size
|
||||||
|
* block in a binary file. To prepare an index, each region in the file is
|
||||||
|
* hashed; the values and counts of hashes are retained in a sorted table.
|
||||||
|
* Define the similarity fraction F as the count of matching regions
|
||||||
|
* between the two files divided by the maximum count of regions in
|
||||||
|
* either file. The similarity score is F multiplied by the maxScore
|
||||||
|
* constant, yielding a range [0, maxScore]. It is defined as maxScore for
|
||||||
|
* the degenerate case of two empty files.
|
||||||
|
* <p>
|
||||||
|
* The similarity score is symmetrical; i.e. {@code a.score(b, maxScore) == b.score(a, maxScore)}.
|
||||||
|
*
|
||||||
|
* @param dst
|
||||||
|
* the other index
|
||||||
|
* @param maxScore
|
||||||
|
* the score representing a 100% match
|
||||||
|
* @return the similarity score
|
||||||
|
*/
|
||||||
|
public int score(SimilarityIndex dst, int maxScore) {
|
||||||
long max = Math.max(hashedCnt, dst.hashedCnt);
|
long max = Math.max(hashedCnt, dst.hashedCnt);
|
||||||
if (max == 0)
|
if (max == 0)
|
||||||
return maxScore;
|
return maxScore;
|
||||||
|
@ -381,7 +424,8 @@ private static long countOf(long v) {
|
||||||
return v & MAX_COUNT;
|
return v & MAX_COUNT;
|
||||||
}
|
}
|
||||||
|
|
||||||
static class TableFullException extends Exception {
|
/** Thrown by {@code create()} when the file is too large. */
|
||||||
|
public static class TableFullException extends Exception {
|
||||||
private static final long serialVersionUID = 1L;
|
private static final long serialVersionUID = 1L;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue