diff options
Diffstat (limited to 'Algorithms/Part-II/1-WordNet/WordNet.java')
-rw-r--r-- | Algorithms/Part-II/1-WordNet/WordNet.java | 155 |
1 files changed, 140 insertions, 15 deletions
diff --git a/Algorithms/Part-II/1-WordNet/WordNet.java b/Algorithms/Part-II/1-WordNet/WordNet.java index 830ccdf..2a854fa 100644 --- a/Algorithms/Part-II/1-WordNet/WordNet.java +++ b/Algorithms/Part-II/1-WordNet/WordNet.java @@ -1,58 +1,183 @@ /* vim: set expandtab tabstop=4 shiftwidth=4 : */ +import java.util.HashMap; +import java.util.Iterator; + public class WordNet { // data type space linear in the input size + private HashMap<String, Bag<Integer>> wordsToIds; + private HashMap<Integer, String> idToString; + private Digraph dag; + + private SAP sap; // constructor takes the name of the two input files + // time linearithmic in the input size public WordNet(String synsets, String hypernyms) { - // time linearithmic in the input size + int size = 0; + wordsToIds = new HashMap<String, Bag<Integer>>(); + idToString = new HashMap<Integer, String>(); + + // load words from synsets + In syn = new In(synsets); + while (syn.hasNextLine()) { + String line = syn.readLine(); + String[] fields = line.split(","); + int id = Integer.parseInt(fields[0]); + String[] words = fields[1].split(" "); + for (String word : words) { + Bag<Integer> bag = wordsToIds.get(word); + if (bag == null) { + bag = new Bag<Integer>(); + bag.add(id); + wordsToIds.put(word, bag); + } else { + bag.add(id); + } + } + idToString.put(id, fields[1]); + size++; + } + syn.close(); + + dag = new Digraph(size); + + boolean[] roots = new boolean[dag.V()]; + for (int v = 0; v < dag.V(); v++) + roots[v] = true; - // throw a java.lang.IllegalArgumentException - // if the input does not correspond to a rooted DAG + // load edges from hypernyms + In hyp = new In(hypernyms); + while (hyp.hasNextLine()) { + String line = hyp.readLine(); + String[] fields = line.split(","); + int id = Integer.parseInt(fields[0]); + if (fields.length > 1) roots[id] = false; + for (int i = 1; i < fields.length; i++) { + dag.addEdge(id, Integer.parseInt(fields[i])); + } + } + hyp.close(); + + int rootsCount = 0; + for (int v = 0; v < dag.V(); v++) + if (roots[v]) rootsCount++; + if (rootsCount != 1) + throw new IllegalArgumentException("roots "+rootsCount); + + checkIndegrees(); + + sap = new SAP(dag); + } + + private void checkIndegrees() + { + int[] indegree = new int[dag.V()]; + + // compute indegrees + for (int v = 0; v < dag.V(); v++) { + for (int w : dag.adj(v)) { + indegree[w]++; + } + } + + // initialize queue to contain all vertices with indegree = 0 (leaves) + Queue<Integer> queue = new Queue<Integer>(); + for (int v = 0; v < dag.V(); v++) + if (indegree[v] == 0) queue.enqueue(v); + + // BFS and decrease indegrees + for (int j = 0; !queue.isEmpty(); j++) { + int v = queue.dequeue(); + for (int w : dag.adj(v)) { + indegree[w]--; + if (indegree[w] == 0) queue.enqueue(w); + } + } + + // check that all indegrees are 0 + for (int v = 0; v < indegree.length; v++) { + if (indegree[v] != 0) + throw new IllegalArgumentException("indegrees"); + } } // the set of nouns (no duplicates), returned as an Iterable public Iterable<String> nouns() { - return null; + if (wordsToIds == null) return null; + return wordsToIds.keySet(); } // is the word a WordNet noun? public boolean isNoun(String word) { - // run in time logarithmic in the number of nouns - - return false; + if (wordsToIds == null) return false; + return wordsToIds.containsKey(word); } // distance between nounA and nounB (defined below) public int distance(String nounA, String nounB) { - // run in time linear in the size of the WordNet digraph + if (!isNoun(nounA) || !isNoun(nounB)) + throw new IllegalArgumentException(); + + Iterable<Integer> it0 = nounIterable(nounA); + Iterable<Integer> it1 = nounIterable(nounB); - // throw java.lang.IllegalArgumentException - // unless both of the noun arguments are WordNet nouns + return sap.length(it0, it1); + } - return 0; + private Iterable<Integer> nounIterable(final String noun) { + return new Iterable<Integer>() { + public Iterator<Integer> iterator() { + return wordsToIds.get(noun).iterator(); + } + }; } // a synset that is the common ancestor of nounA and nounB // in a shortest ancestral path public String sap(String nounA, String nounB) { - // run in time linear in the size of the WordNet digraph + if (!isNoun(nounA) || !isNoun(nounB)) + throw new java.lang.IllegalArgumentException(); - // throw java.lang.IllegalArgumentException - // unless both of the noun arguments are WordNet nouns + Iterable<Integer> it0 = nounIterable(nounA); + Iterable<Integer> it1 = nounIterable(nounB); - return ""; + return idToString.get(sap.ancestor(it0, it1)); } // for unit testing of this class public static void main(String[] args) { + WordNet wordNet = new WordNet("./data/synsets.txt", "./data/hypernyms.txt"); + + StdOut.println("23 white_marlin, mileage: " + + wordNet.distance("white_marlin", "mileage")); + StdOut.println("32 Black_Plague, black_marlin: " + + wordNet.distance("Black_Plague", "black_marlin")); + StdOut.println("32 American_water_spaniel, histology: " + + wordNet.distance("American_water_spaniel", "histology")); + StdOut.println("32 Brown_Swiss, barrel_roll: " + + wordNet.distance("Brown_Swiss", "barrel_roll")); + StdOut.println("bedspring Carolus_Linnaeus: " + + wordNet.sap("bedspring", "Carolus_Linnaeus")); + try { + StdOut.println("check roots"); + new WordNet("./data/synsets.txt", "./data/hypernymsInvalidTwoRoots.txt"); + StdOut.println("BAD"); + } + catch (IllegalArgumentException e) { StdOut.println(e.getMessage()); } + try { + StdOut.println("check cycles"); + new WordNet("./data/synsets.txt", "./data/hypernymsInvalidCycle.txt"); + StdOut.println("BAD"); + } + catch (IllegalArgumentException e) { StdOut.println(e.getMessage()); } } } |