001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.text.similarity;
018
019import java.util.Collection;
020import java.util.HashMap;
021import java.util.Map;
022import java.util.Map.Entry;
023import java.util.Set;
024import java.util.function.Function;
025
026/**
027 * Measures the intersection of two sets created from a pair of character sequences.
028 *
029 * <p>It is assumed that the type {@code T} correctly conforms to the requirements for storage
030 * within a {@link Set} or {@link HashMap}. Ideally the type is immutable and implements
031 * {@link Object#equals(Object)} and {@link Object#hashCode()}.</p>
032 *
033 * @param <T> the type of the elements extracted from the character sequence
034 * @since 1.7
035 * @see Set
036 * @see HashMap
037 */
038public class IntersectionSimilarity<T> implements SimilarityScore<IntersectionResult> {
039    /** The converter used to create the elements from the characters. */
040    private final Function<CharSequence, Collection<T>> converter;
041
042    // The following is adapted from commons-collections for a Bag.
043    // A Bag is a collection that can store the count of the number
044    // of copies of each element.
045
046    /**
047     * Mutable counter class for storing the count of elements.
048     */
049    private static class BagCount {
050        /** The count. This is initialised to 1 upon construction. */
051        int count = 1;
052    }
053
054    /**
055     * A minimal implementation of a Bag that can store elements and a count.
056     *
057     * <p>For the intended purpose the Bag does not have to be a {@link Collection}. It does not
058     * even have to know its own size.
059     */
060    private class TinyBag {
061        /** The backing map. */
062        private final Map<T, BagCount> map;
063
064        /**
065         * Create a new tiny bag.
066         *
067         * @param initialCapacity the initial capacity
068         */
069        TinyBag(final int initialCapacity) {
070            map = new HashMap<>(initialCapacity);
071        }
072
073        /**
074         * Adds a new element to the bag, incrementing its count in the underlying map.
075         *
076         * @param object the object to add
077         */
078        void add(final T object) {
079            final BagCount mut = map.get(object);
080            if (mut == null) {
081                map.put(object, new BagCount());
082            } else {
083                mut.count++;
084            }
085        }
086
087        /**
088         * Returns the number of occurrence of the given element in this bag by
089         * looking up its count in the underlying map.
090         *
091         * @param object the object to search for
092         * @return The number of occurrences of the object, zero if not found
093         */
094        int getCount(final Object object) {
095            final BagCount count = map.get(object);
096            if (count != null) {
097                return count.count;
098            }
099            return 0;
100        }
101
102        /**
103         * Returns a Set view of the mappings contained in this bag.
104         *
105         * @return The Set view
106         */
107        Set<Entry<T, BagCount>> entrySet() {
108            return map.entrySet();
109        }
110
111        /**
112         * Get the number of unique elements in the bag.
113         *
114         * @return The unique element size
115         */
116        int uniqueElementSize() {
117            return map.size();
118        }
119    }
120
121    /**
122     * Create a new intersection similarity using the provided converter.
123     *
124     * <p>If the converter returns a {@link Set} then the intersection result will
125     * not include duplicates. Any other {@link Collection} is used to produce a result
126     * that will include duplicates in the intersect and union.
127     *
128     * @param converter the converter used to create the elements from the characters
129     * @throws IllegalArgumentException if the converter is null
130     */
131    public IntersectionSimilarity(final Function<CharSequence, Collection<T>> converter) {
132        if (converter == null) {
133            throw new IllegalArgumentException("Converter must not be null");
134        }
135        this.converter = converter;
136    }
137
138    /**
139     * Calculates the intersection of two character sequences passed as input.
140     *
141     * @param left first character sequence
142     * @param right second character sequence
143     * @return The intersection result
144     * @throws IllegalArgumentException if either input sequence is {@code null}
145     */
146    @Override
147    public IntersectionResult apply(final CharSequence left, final CharSequence right) {
148        if (left == null || right == null) {
149            throw new IllegalArgumentException("Input cannot be null");
150        }
151
152        // Create the elements from the sequences
153        final Collection<T> objectsA = converter.apply(left);
154        final Collection<T> objectsB = converter.apply(right);
155        final int sizeA = objectsA.size();
156        final int sizeB = objectsB.size();
157
158        // Short-cut if either collection is empty
159        if (Math.min(sizeA, sizeB) == 0) {
160            // No intersection
161            return new IntersectionResult(sizeA, sizeB, 0);
162        }
163
164        // Intersection = count the number of shared elements
165        int intersection;
166        if (objectsA instanceof Set && objectsB instanceof Set) {
167            // If a Set then the elements will only have a count of 1.
168            // Iterate over the smaller set.
169            intersection = (sizeA < sizeB)
170                    ? getIntersection((Set<T>) objectsA, (Set<T>) objectsB)
171                    : getIntersection((Set<T>) objectsB, (Set<T>) objectsA);
172        } else  {
173            // Create a bag for each collection
174            final TinyBag bagA = toBag(objectsA);
175            final TinyBag bagB = toBag(objectsB);
176            // Iterate over the smaller number of unique elements
177            intersection = (bagA.uniqueElementSize() < bagB.uniqueElementSize())
178                    ? getIntersection(bagA, bagB)
179                    : getIntersection(bagB, bagA);
180        }
181
182        return new IntersectionResult(sizeA, sizeB, intersection);
183    }
184
185    /**
186     * Convert the collection to a bag. The bag will contain the count of each element
187     * in the collection.
188     *
189     * @param objects the objects
190     * @return The bag
191     */
192    private TinyBag toBag(final Collection<T> objects) {
193        final TinyBag bag = new TinyBag(objects.size());
194        for (final T t : objects) {
195            bag.add(t);
196        }
197        return bag;
198    }
199
200    /**
201     * Compute the intersection between two sets. This is the count of all the elements
202     * that are within both sets.
203     *
204     * @param <T> the type of the elements in the set
205     * @param setA the set A
206     * @param setB the set B
207     * @return The intersection
208     */
209    private static <T> int getIntersection(final Set<T> setA, final Set<T> setB) {
210        int intersection = 0;
211        for (final T element : setA) {
212            if (setB.contains(element)) {
213                intersection++;
214            }
215        }
216        return intersection;
217    }
218
219    /**
220     * Compute the intersection between two bags. This is the sum of the minimum
221     * count of each element that is within both sets.
222     *
223     * @param bagA the bag A
224     * @param bagB the bag B
225     * @return The intersection
226     */
227    private int getIntersection(final TinyBag bagA, final TinyBag bagB) {
228        int intersection = 0;
229        for (final Entry<T, BagCount> entry : bagA.entrySet()) {
230            final T element = entry.getKey();
231            final int count = entry.getValue().count;
232            // The intersection of this entry in both bags is the minimum count
233            intersection += Math.min(count, bagB.getCount(element));
234        }
235        return intersection;
236    }
237}