aboutsummaryrefslogtreecommitdiff
path: root/src/com
diff options
context:
space:
mode:
Diffstat (limited to 'src/com')
-rw-r--r-- src/com/benlinskey/grdbc/GRDBC.java 51
-rw-r--r-- src/com/benlinskey/grdbc/LexiconCreator.java 214
-rw-r--r-- src/com/benlinskey/grdbc/LexiconParser.java 197
3 files changed, 462 insertions, 0 deletions
diff --git a/src/com/benlinskey/grdbc/GRDBC.java b/src/com/benlinskey/grdbc/GRDBC.java
new file mode 100644
index 0000000..9afcfe8
--- /dev/null
+++ b/src/com/benlinskey/grdbc/GRDBC.java
@@ -0,0 +1,51 @@
+/* Copyright 2013 Benjamin Linskey
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.benlinskey.grdbc;
+
+/**
+ * This class provides a command line interface for the program.
+ * @author Ben Linskey
+ */
+public class GRDBC {
+ public static void main(String[] args) {
+ if (args.length != 1) {
+ displayUsage();
+ System.exit(1);
+ }
+
+ String opt = args[0];
+ if (opt.equals("-a")) {
+ // TODO
+ } else if (opt.equals("-l")) {
+ (new LexiconCreator()).run();
+ } else if (opt.equals("-g")) {
+ // TODO
+ } else {
+ displayUsage();
+ }
+ }
+
+ /**
+ * Displays usage information for the program.
+ */
+ private static void displayUsage() {
+ System.out.println("Usage: java -jar grdbc.jar [option]\n");
+ System.out.println("Options:");
+ System.out.printf("%5s\t\t%20s\n", "-a", "Create all databases");
+ System.out.printf("%5s\t\t%20s\n", "-l", "Create lexicon database");
+ System.out.printf("%5s\t\t%20s\n", "-g", "Create grammar database");
+ }
+}
diff --git a/src/com/benlinskey/grdbc/LexiconCreator.java b/src/com/benlinskey/grdbc/LexiconCreator.java
new file mode 100644
index 0000000..7416b28
--- /dev/null
+++ b/src/com/benlinskey/grdbc/LexiconCreator.java
@@ -0,0 +1,214 @@
+/* Copyright 2013 Benjamin Linskey
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.benlinskey.grdbc;
+
+import java.io.BufferedReader;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.sql.Connection;
+import java.sql.DriverManager;
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+import java.sql.Statement;
+
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.xml.sax.SAXException;
+
+/**
+ * Reads in an XML file containing a Greek lexicon and stores entries in an
+ * SQLite database.
+ * <p>
+ * The constructor opens the database and (re)creates the empty lexicon
+ * table; {@link #run()} then inserts the entries, builds the search index
+ * and closes the connection. Any failure is reported on standard error and
+ * terminates the JVM with exit status 1.
+ * @author Ben Linskey
+ */
+public class LexiconCreator {
+    private final static String FILE = "../xml/Perseus_text_1999.04.0058.xml";
+    private final static String DB = "lexicon.db";
+    private final static String TABLE_NAME = "lexicon";
+    private Connection connection;
+    private PreparedStatement insertStatement;
+
+    /**
+     * Class constructor. Loads the SQLite JDBC driver, connects to the
+     * database file, resets the lexicon table and prepares the statement
+     * used to insert entries.
+     */
+    public LexiconCreator() {
+        // Load driver.
+        try {
+            Class.forName("org.sqlite.JDBC");
+        } catch (ClassNotFoundException e) {
+            fail(e);
+        }
+
+        // Connect to database. Auto-commit is disabled so that the batch of
+        // inserts is committed once, which is much faster.
+        try {
+            connection = DriverManager.getConnection("jdbc:sqlite:" + DB);
+            connection.setAutoCommit(false);
+        } catch (SQLException e) {
+            fail(e);
+        }
+
+        createDatabase();
+
+        // Create a prepared statement to use when inserting entries. The
+        // leading NULL lets SQLite assign the _ID value automatically.
+        try {
+            insertStatement = connection.prepareStatement("INSERT INTO "
+                    + TABLE_NAME + " VALUES (NULL, ?, ?, ?, ?, ?, ?)");
+        } catch (SQLException e) {
+            fail(e);
+        }
+    }
+
+    /**
+     * Creates the lexicon database: inserts all entries, builds the search
+     * index and releases the database resources.
+     */
+    public void run() {
+        addEntries();
+        createIndex();
+        try {
+            insertStatement.close();
+            connection.close();
+        } catch (SQLException e) {
+            fail(e);
+        }
+        System.out.println("Done.");
+    }
+
+    /**
+     * Resets the database if it already exists and creates a new, empty
+     * database.
+     */
+    private void createDatabase() {
+        System.out.println("Creating lexicon database...");
+        String dropTable = "DROP TABLE IF EXISTS " + TABLE_NAME;
+        // The key must be declared exactly INTEGER PRIMARY KEY: only then is
+        // the column an alias for SQLite's rowid, so that inserting NULL
+        // yields an automatically assigned id. With "INT PRIMARY KEY" the
+        // NULL would be stored as-is and every row would have a NULL _ID.
+        String createTable = "CREATE TABLE " + TABLE_NAME + " ("
+                + "_ID INTEGER PRIMARY KEY, "
+                + "betaNoSymbols VARCHAR(100), "
+                + "betaSymbols VARCHAR(100), "
+                + "greekFullWord VARCHAR(100), "
+                + "greekNoSymbols VARCHAR(100), "
+                + "greekLowercase VARCHAR(100), "
+                + "entry TEXT)";
+        executeAndCommit(dropTable, createTable);
+    }
+
+    /**
+     * Parses the XML file, modifies the lexicon entries, and inserts the
+     * modified entries into the database.
+     */
+    private void addEntries() {
+        System.out.println("Inserting entries...");
+
+        try {
+            BufferedReader in = new BufferedReader(new FileReader(FILE));
+            try {
+                StringBuilder xml = new StringBuilder();
+
+                // Extract the XML for each lexicon entry, then process it.
+                // readLine() returning null is the end-of-file signal;
+                // ready() only says whether a read would block.
+                String line;
+                while ((line = in.readLine()) != null) {
+                    if (line.startsWith("<entry ")) {
+                        xml.setLength(0); // Start a new chunk of XML.
+                    }
+                    // readLine() strips the line terminator; put one back so
+                    // that text on adjacent lines is not fused together.
+                    xml.append(line).append('\n');
+                    if (line.startsWith("</entry>")) {
+                        processEntry(xml.toString());
+                    }
+                }
+            } finally {
+                in.close();
+            }
+
+            insertStatement.executeBatch();
+            connection.commit();
+        } catch (FileNotFoundException e) {
+            System.err.println("Error: Lexicon file not found.");
+            System.exit(1);
+        } catch (IOException e) {
+            fail(e);
+        } catch (SQLException e) {
+            fail(e);
+        }
+    }
+
+    /**
+     * Modifies the specified entry and adds it to the pending insert batch.
+     * @param xml the XML containing the entry to process
+     */
+    private void processEntry(String xml) {
+        try {
+            LexiconParser parser = new LexiconParser(xml);
+            insertStatement.setString(1, parser.getBetaNoSymbols());
+            insertStatement.setString(2, parser.getBetaSymbols());
+            insertStatement.setString(3, parser.getGreekFullWord());
+            insertStatement.setString(4, parser.getGreekNoSymbols());
+            insertStatement.setString(5, parser.getGreekLowercase());
+            insertStatement.setString(6, parser.getEntry());
+            insertStatement.addBatch();
+        } catch (ParserConfigurationException e) {
+            fail(e);
+        } catch (SAXException e) {
+            fail(e);
+        } catch (IOException e) {
+            fail(e);
+        } catch (SQLException e) {
+            fail(e);
+        }
+    }
+
+    /**
+     * Creates an index on the database to speed up searches.
+     */
+    private void createIndex() {
+        System.out.println("Creating index...");
+
+        // Create an index on the three columns matched against search queries.
+        String sql = "CREATE INDEX searchIndex ON " + TABLE_NAME
+                + " (betaNoSymbols, betaSymbols, greekNoSymbols)";
+        // A failure here aborts the program like every other database error,
+        // instead of being logged and followed by a misleading "Done.".
+        executeAndCommit(sql);
+    }
+
+    /**
+     * Executes the given SQL statements in order and commits them, closing
+     * the underlying JDBC statement even if execution fails.
+     * @param statements the SQL statements to execute
+     */
+    private void executeAndCommit(String... statements) {
+        try {
+            Statement statement = connection.createStatement();
+            try {
+                for (String sql : statements) {
+                    statement.executeUpdate(sql);
+                }
+                connection.commit();
+            } finally {
+                statement.close();
+            }
+        } catch (SQLException e) {
+            fail(e);
+        }
+    }
+
+    /**
+     * Reports an unrecoverable error and terminates the program with exit
+     * status 1.
+     * @param e the exception that caused the failure
+     */
+    private static void fail(Exception e) {
+        e.printStackTrace();
+        System.exit(1);
+    }
+}
diff --git a/src/com/benlinskey/grdbc/LexiconParser.java b/src/com/benlinskey/grdbc/LexiconParser.java
new file mode 100644
index 0000000..46dcad5
--- /dev/null
+++ b/src/com/benlinskey/grdbc/LexiconParser.java
@@ -0,0 +1,197 @@
+/* Copyright 2013 Benjamin Linskey
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.benlinskey.grdbc;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.io.StringWriter;
+import java.io.UnsupportedEncodingException;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerConfigurationException;
+import javax.xml.transform.TransformerException;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+import edu.unc.epidoc.transcoder.TransCoder;
+
+/**
+ * This class provides methods to parse a chunk of XML containing a lexicon
+ * entry, modify the data contained therein, and return data to be inserted
+ * into the database.
+ * <p>
+ * The headword is read from the {@code key} attribute of the first
+ * {@code entry} element, which is expected to hold Beta Code.
+ * @author Ben Linskey
+ */
+public class LexiconParser {
+    private Document doc;
+    private TransCoder transcoder;
+
+    /** Transcoded entry XML, built on the first call to {@link #getEntry()}. */
+    private String entryXml;
+
+    /**
+     * Class constructor. Parses the given XML and sets up the Beta Code to
+     * Greek transcoder; the program exits with status 1 if the transcoder
+     * cannot be created.
+     * @param xml the XML to parse
+     * @throws ParserConfigurationException if no XML parser can be created
+     * @throws IOException if the XML cannot be read
+     * @throws SAXException if the XML is not well-formed
+     */
+    public LexiconParser(String xml) throws ParserConfigurationException, SAXException, IOException {
+        // Parse the XML and create a Document.
+        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
+        DocumentBuilder db = dbf.newDocumentBuilder();
+        InputSource is = new InputSource(new StringReader(xml));
+        doc = db.parse(is);
+
+        // Create a TransCoder for converting Beta Code to Greek characters.
+        try {
+            transcoder = new TransCoder("BetaCode", "UnicodeC");
+        } catch (Exception e) {
+            e.printStackTrace();
+            System.exit(1);
+        }
+    }
+
+    /**
+     * Returns a Beta Code representation of this entry's word, stripped of
+     * all diacritics.
+     * @return this entry's word in Beta Code without diacritics
+     */
+    public String getBetaNoSymbols() {
+        // Get the word and replace all symbols with an empty string.
+        return getBetaSymbols().replaceAll("[^a-zA-Z]", "");
+    }
+
+    /**
+     * Returns a Beta Code representation of this entry's word.
+     * @return this entry's word in Beta Code
+     */
+    public String getBetaSymbols() {
+        // We just need the "key" attribute from the "entry" element.
+        Node entry = doc.getElementsByTagName("entry").item(0);
+        return entry.getAttributes().getNamedItem("key").getTextContent();
+    }
+
+    /**
+     * Returns this entry's word in Greek characters.
+     * @return this entry's word in Greek characters
+     */
+    public String getGreekFullWord() {
+        // Use the transcoder to convert the beta code to Greek.
+        return betaToGreek(getBetaSymbols());
+    }
+
+    /**
+     * Returns this entry's word in Greek characters, stripped of all
+     * diacritics.
+     * @return this entry's word in Greek characters without diacritics
+     */
+    public String getGreekNoSymbols() {
+        // Get beta code with no symbols other than the capital letter marker.
+        String beta = getBetaSymbols().replaceAll("[^a-zA-Z\\*]", "");
+
+        // Use the transcoder to convert the beta code to Greek.
+        return betaToGreek(beta);
+    }
+
+    /**
+     * Returns this entry's word in all lowercase Greek characters, stripped
+     * of all diacritics.
+     * @return this entry's word in lowercase Greek characters without
+     *         diacritics
+     */
+    public String getGreekLowercase() {
+        return getGreekNoSymbols().toLowerCase();
+    }
+
+    /**
+     * Returns the XML for this entry, with all Beta Code converted to Greek
+     * characters. The conversion modifies the parsed document, so it is
+     * performed only once and the result is reused on later calls.
+     * @return the XML for this entry with all Beta Code converted to Greek
+     *         characters
+     */
+    public String getEntry() {
+        if (entryXml == null) {
+            transcodeInElements("orth");
+            transcodeInElements("ref");
+            transcodeInElements("foreign");
+            entryXml = getUpdatedXML();
+        }
+        return entryXml;
+    }
+
+    /**
+     * Transcodes beta code to Greek in elements with the given name. Only
+     * elements whose {@code lang} attribute equals {@code "greek"} are
+     * changed.
+     * @param element the name of the element to search for
+     */
+    private void transcodeInElements(String element) {
+        NodeList nodeList = doc.getElementsByTagName(element);
+        for (int i = 0; i < nodeList.getLength(); i++) {
+            Node elementNode = nodeList.item(i);
+            Node langAttr = elementNode.getAttributes().getNamedItem("lang");
+            if (langAttr != null && langAttr.getTextContent().equals("greek")) {
+                String greek = betaToGreek(elementNode.getTextContent());
+                // Replace the element's text, not the "lang" attribute: the
+                // attribute must keep identifying the content as Greek.
+                // Note that this replaces all children of the element with a
+                // single text node.
+                elementNode.setTextContent(greek);
+            }
+        }
+    }
+
+    /**
+     * Returns a string containing an XML representation of the document in its
+     * current state.
+     * @return a string containing an XML representation of the document in its
+     *         current state
+     */
+    private String getUpdatedXML() {
+        StringWriter writer = new StringWriter();
+        try {
+            TransformerFactory tf = TransformerFactory.newInstance();
+            Transformer transformer = tf.newTransformer();
+            DOMSource source = new DOMSource(doc);
+            StreamResult result = new StreamResult(writer);
+            transformer.transform(source, result);
+        } catch (TransformerConfigurationException e) {
+            e.printStackTrace();
+            System.exit(1);
+        } catch (TransformerException e) {
+            e.printStackTrace();
+            System.exit(1);
+        }
+        return writer.toString();
+    }
+
+    /**
+     * Converts Beta Code to Greek characters. The program exits with status 1
+     * if the transcoder reports an unsupported encoding.
+     * @param beta the Beta Code to transcode
+     * @return the Greek equivalent of the specified Beta Code
+     */
+    private String betaToGreek(String beta) {
+        String greek = null;
+        try {
+            greek = transcoder.getString(beta);
+        } catch (UnsupportedEncodingException e) {
+            e.printStackTrace();
+            System.exit(1);
+        }
+        return greek;
+    }
+}