From e2371bbe19443fe43d5e16f86f811d092e1406ce Mon Sep 17 00:00:00 2001
From: Ben Linskey
Date: Mon, 16 Dec 2013 19:29:01 -0500
Subject: Add interface and lexicon database creation code.

---
 src/com/benlinskey/grdbc/GRDBC.java          |  51 +++++++
 src/com/benlinskey/grdbc/LexiconCreator.java | 214 +++++++++++++++++++++++++++
 src/com/benlinskey/grdbc/LexiconParser.java  | 197 ++++++++++++++++++++++++
 3 files changed, 462 insertions(+)
 create mode 100644 src/com/benlinskey/grdbc/GRDBC.java
 create mode 100644 src/com/benlinskey/grdbc/LexiconCreator.java
 create mode 100644 src/com/benlinskey/grdbc/LexiconParser.java

(limited to 'src')

diff --git a/src/com/benlinskey/grdbc/GRDBC.java b/src/com/benlinskey/grdbc/GRDBC.java
new file mode 100644
index 0000000..9afcfe8
--- /dev/null
+++ b/src/com/benlinskey/grdbc/GRDBC.java
@@ -0,0 +1,51 @@
+/* Copyright 2013 Benjamin Linskey
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.benlinskey.grdbc;
+
+/**
+ * This class provides a command line interface for the program.
+ * @author Ben Linskey
+ */
+public class GRDBC {
+    /**
+     * Runs the database creation step selected by the single command line
+     * option. Only "-l" is implemented so far; "-a" and "-g" are accepted
+     * but currently do nothing. Usage information is printed and the
+     * program exits with status 1 if the argument count is wrong or the
+     * option is not recognized.
+     * @param args the command line arguments; exactly one option is expected
+     */
+    public static void main(String[] args) {
+        if (args.length != 1) {
+            displayUsage();
+            System.exit(1);
+        }
+
+        String opt = args[0];
+        if (opt.equals("-a")) {
+            // TODO
+        } else if (opt.equals("-l")) {
+            (new LexiconCreator()).run();
+        } else if (opt.equals("-g")) {
+            // TODO
+        } else {
+            // An unrecognized option is a usage error just like a wrong
+            // argument count, so report failure to the caller as well.
+            displayUsage();
+            System.exit(1);
+        }
+    }
+
+    /**
+     * Displays usage information for the program.
+     */
+    private static void displayUsage() {
+        System.out.println("Usage: java -jar grdbc.jar [option]\n");
+        System.out.println("Options:");
+        System.out.printf("%5s\t\t%20s\n", "-a", "Create all databases");
+        System.out.printf("%5s\t\t%20s\n", "-l", "Create lexicon database");
+        System.out.printf("%5s\t\t%20s\n", "-g", "Create grammar database");
+    }
+}
diff --git a/src/com/benlinskey/grdbc/LexiconCreator.java b/src/com/benlinskey/grdbc/LexiconCreator.java
new file mode 100644
index 0000000..7416b28
--- /dev/null
+++ b/src/com/benlinskey/grdbc/LexiconCreator.java
@@ -0,0 +1,214 @@
+/* Copyright 2013 Benjamin Linskey
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.benlinskey.grdbc;
+
+import java.io.BufferedReader;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.sql.Connection;
+import java.sql.DriverManager;
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+import java.sql.Statement;
+
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.xml.sax.SAXException;
+
+/**
+ * Reads in an XML file containing a Greek lexicon and stores entries in an
+ * SQLite database.
+ * @author Ben Linskey
+ */
+public class LexiconCreator {
+    private final static String FILE = "../xml/Perseus_text_1999.04.0058.xml";
+    private final static String DB = "lexicon.db";
+    private final static String TABLE_NAME = "lexicon";
+    private Connection connection;
+    private PreparedStatement insertStatement;
+
+    /**
+     * Class constructor. Loads the SQLite JDBC driver, opens the database,
+     * recreates the lexicon table, and prepares the insert statement. Any
+     * failure prints a stack trace and terminates the program with status 1.
+     */
+    public LexiconCreator() {
+        // Load driver.
+        try {
+            Class.forName("org.sqlite.JDBC");
+        } catch (ClassNotFoundException e) {
+            e.printStackTrace();
+            System.exit(1);
+        }
+
+        // Connect to database.
+        try {
+            connection = DriverManager.getConnection("jdbc:sqlite:" + DB);
+        } catch (SQLException e) {
+            e.printStackTrace();
+            System.exit(1);
+        }
+
+        // Use batch inserts for speed.
+        try {
+            connection.setAutoCommit(false);
+        } catch (SQLException e) {
+            e.printStackTrace();
+            System.exit(1);
+        }
+
+        createDatabase();
+
+        // Create a prepared statement to use when inserting entries. The
+        // leading NULL is the _ID column, which SQLite fills in itself.
+        try {
+            insertStatement = connection.prepareStatement("INSERT INTO "
+                    + TABLE_NAME + " VALUES (NULL, ?, ?, ?, ?, ?, ?)");
+        } catch (SQLException e) {
+            e.printStackTrace();
+            System.exit(1);
+        }
+    }
+
+    /**
+     * Creates the lexicon database: inserts all entries, builds the search
+     * index, and closes the statement and the connection.
+     */
+    public void run() {
+        addEntries();
+        createIndex();
+        try {
+            insertStatement.close();
+            connection.close();
+        } catch (SQLException e) {
+            e.printStackTrace();
+            System.exit(1);
+        }
+        System.out.println("Done.");
+    }
+
+    /**
+     * Resets the database if it already exists and creates a new, empty
+     * database.
+     */
+    private void createDatabase() {
+        System.out.println("Creating lexicon database...");
+        try {
+            String dropTable = "DROP TABLE IF EXISTS " + TABLE_NAME;
+            // The key must be declared INTEGER (not INT): only an
+            // INTEGER PRIMARY KEY column is an alias for SQLite's rowid, so
+            // only then does the NULL inserted for _ID become a generated id.
+            String createTable = "CREATE TABLE " + TABLE_NAME + " ("
+                    + "_ID INTEGER PRIMARY KEY, "
+                    + "betaNoSymbols VARCHAR(100), "
+                    + "betaSymbols VARCHAR(100), "
+                    + "greekFullWord VARCHAR(100), "
+                    + "greekNoSymbols VARCHAR(100), "
+                    + "greekLowercase VARCHAR(100), "
+                    + "entry TEXT)";
+            Statement statement = connection.createStatement();
+            statement.executeUpdate(dropTable);
+            statement.executeUpdate(createTable);
+            connection.commit();
+            statement.close();
+        } catch (SQLException e) {
+            e.printStackTrace();
+            System.exit(1);
+        }
+    }
+
+    /**
+     * Parses the XML file, modifies the lexicon entries, and inserts the
+     * modified entries into the database. The file is read line by line;
+     * the lines of one entry are buffered (without line separators) and
+     * handed to processEntry when the entry's closing line is reached. The
+     * batched inserts are executed and committed once the file has been
+     * read.
+     */
+    private void addEntries() {
+        System.out.println("Inserting entries...");
+
+        try {
+            BufferedReader in = new BufferedReader(new FileReader(FILE));
+            StringBuilder xml = new StringBuilder();
+
+            // Extract the XML for each lexicon entry, then process it.
+            // NOTE(review): the start/end markers were missing from this
+            // copy of the patch (the test read startsWith(""), which matches
+            // every line). They are reconstructed from the "entry" element
+            // that LexiconParser reads; confirm them against the XML file.
+            String line;
+            // readLine() returns null once the end of the file is reached.
+            while ((line = in.readLine()) != null) {
+                if (line.startsWith("<entry")) {
+                    // A new entry begins: drop whatever was buffered before.
+                    xml.setLength(0);
+                    xml.append(line);
+                } else if (line.startsWith("</entry>")) {
+                    xml.append(line);
+                    processEntry(xml.toString());
+                } else {
+                    xml.append(line);
+                }
+            }
+            in.close();
+
+            insertStatement.executeBatch();
+            connection.commit();
+        } catch (FileNotFoundException e) {
+            System.err.println("Error: Lexicon file not found.");
+            System.exit(1);
+        } catch (IOException e) {
+            e.printStackTrace();
+            System.exit(1);
+        } catch (SQLException e) {
+            e.printStackTrace();
+            System.exit(1);
+        }
+    }
+
+    /**
+     * Modifies the specified entry and inserts it into the database. The
+     * insert is only added to the current batch here; addEntries executes
+     * the batch.
+     * @param xml the XML containing the entry to process
+     */
+    private void processEntry(String xml) {
+        try {
+            LexiconParser parser = new LexiconParser(xml);
+            insertStatement.setString(1, parser.getBetaNoSymbols());
+            insertStatement.setString(2, parser.getBetaSymbols());
+            insertStatement.setString(3, parser.getGreekFullWord());
+            insertStatement.setString(4, parser.getGreekNoSymbols());
+            insertStatement.setString(5, parser.getGreekLowercase());
+            insertStatement.setString(6, parser.getEntry());
+            insertStatement.addBatch();
+        } catch (ParserConfigurationException e) {
+            e.printStackTrace();
+            System.exit(1);
+        } catch (SAXException e) {
+            e.printStackTrace();
+            System.exit(1);
+        } catch (IOException e) {
+            e.printStackTrace();
+            System.exit(1);
+        } catch (SQLException e) {
+            e.printStackTrace();
+            System.exit(1);
+        }
+    }
+
+    /**
+     * Creates an index on the database to speed up searches.
+     */
+    private void createIndex() {
+        System.out.println("Creating index...");
+
+        // Create an index on the three columns matched against search queries.
+        String sql = "CREATE INDEX searchIndex ON " + TABLE_NAME
+                + " (betaNoSymbols, betaSymbols, greekNoSymbols)";
+        try {
+            Statement statement = connection.createStatement();
+            statement.executeUpdate(sql);
+            statement.close();
+            connection.commit();
+        } catch (SQLException e) {
+            e.printStackTrace();
+            // Fail like every other step does, so that run() cannot report
+            // "Done." for a database that is missing its index.
+            System.exit(1);
+        }
+    }
+}
diff --git a/src/com/benlinskey/grdbc/LexiconParser.java b/src/com/benlinskey/grdbc/LexiconParser.java
new file mode 100644
index 0000000..46dcad5
--- /dev/null
+++ b/src/com/benlinskey/grdbc/LexiconParser.java
@@ -0,0 +1,197 @@
+/* Copyright 2013 Benjamin Linskey
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.benlinskey.grdbc;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.io.StringWriter;
+import java.io.UnsupportedEncodingException;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerConfigurationException;
+import javax.xml.transform.TransformerException;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+import edu.unc.epidoc.transcoder.TransCoder;
+
+/**
+ * This class provides methods to parse a chunk of XML containing a lexicon
+ * entry, modify the data contained therein, and return data to be inserted
+ * into the database.
+ * @author Ben Linskey
+ */
+public class LexiconParser {
+    private Document doc;
+    private TransCoder transcoder;
+
+    /**
+     * Class constructor. Parses the given XML into a DOM document and
+     * creates the Beta Code transcoder. If the transcoder cannot be
+     * created, a stack trace is printed and the program terminates.
+     * @param xml the XML to parse
+     * @throws ParserConfigurationException if no XML document builder can
+     *         be created
+     * @throws IOException if the XML cannot be read
+     * @throws SAXException if the XML cannot be parsed
+     */
+    public LexiconParser(String xml) throws ParserConfigurationException, SAXException, IOException {
+        // Parse the XML and create a Document.
+        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
+        DocumentBuilder db = dbf.newDocumentBuilder();
+        InputSource is = new InputSource(new StringReader(xml));
+        doc = db.parse(is);
+
+        // Create a TransCoder for converting Beta Code to Greek characters.
+        try {
+            transcoder = new TransCoder("BetaCode", "UnicodeC");
+        } catch (Exception e) {
+            e.printStackTrace();
+            System.exit(1);
+        }
+    }
+
+    /**
+     * Returns a Beta Code representation of this entry's word, stripped of
+     * all diacritics.
+     * @return this entry's word in Beta Code without diacritics
+     */
+    public String getBetaNoSymbols() {
+        // Get the word and replace all symbols with an empty string.
+        return getBetaSymbols().replaceAll("[^a-zA-Z]", "");
+    }
+
+    /**
+     * Returns a Beta Code representation of this entry's word.
+     * @return this entry's word in Beta Code
+     */
+    public String getBetaSymbols() {
+        // We just need the "key" attribute from the "entry" element.
+        Node entry = doc.getElementsByTagName("entry").item(0);
+        return entry.getAttributes().getNamedItem("key").getTextContent();
+    }
+
+    /**
+     * Returns this entry's word in Greek characters.
+     * @return this entry's word in Greek characters
+     */
+    public String getGreekFullWord() {
+        // Use the transcoder to convert the beta code to Greek.
+        return betaToGreek(getBetaSymbols());
+    }
+
+    /**
+     * Returns this entry's word in Greek characters, stripped of all
+     * diacritics.
+     * @return this entry's word in Greek characters without diacritics
+     */
+    public String getGreekNoSymbols() {
+        // Get beta code with no symbols other than the capital letter marker.
+        String beta = getBetaSymbols().replaceAll("[^a-zA-Z\\*]", "");
+
+        // Use the transcoder to convert the beta code to Greek.
+        return betaToGreek(beta);
+    }
+
+    /**
+     * Returns this entry's word in all lowercase Greek characters, stripped
+     * of all diacritics.
+     * @return this entry's word in lowercase Greek characters without
+     *         diacritics
+     */
+    public String getGreekLowercase() {
+        return getGreekNoSymbols().toLowerCase();
+    }
+
+    /**
+     * Returns the XML for this entry, with all Beta Code converted to Greek
+     * characters. The conversion modifies the parsed document in place and
+     * leaves the "lang" attributes untouched, so a second call would pass
+     * the already converted text through the transcoder again; call this
+     * method once per parser.
+     * @return the XML for this entry with all Beta Code converted to Greek
+     *         characters
+     */
+    public String getEntry() {
+        transcodeInElements("orth");
+        transcodeInElements("ref");
+        transcodeInElements("foreign");
+        return getUpdatedXML();
+    }
+
+    /**
+     * Transcodes beta code to Greek in elements with the given name. Only
+     * elements whose "lang" attribute equals "greek" are changed.
+     * @param element the name of the element to search for
+     */
+    private void transcodeInElements(String element) {
+        NodeList nodeList = doc.getElementsByTagName(element);
+        for (int i = 0; i < nodeList.getLength(); i++) {
+            Node elementNode = nodeList.item(i);
+            Node langAttr = elementNode.getAttributes().getNamedItem("lang");
+            if (langAttr != null) {
+                String lang = langAttr.getTextContent();
+                if (lang.equals("greek")) {
+                    String greek = betaToGreek(elementNode.getTextContent());
+                    // Write the Greek back into the element itself, not
+                    // into its lang attribute. Note that setTextContent
+                    // replaces any child nodes with a single text node.
+                    elementNode.setTextContent(greek);
+                }
+            }
+        }
+    }
+
+    /**
+     * Returns a string containing an XML representation of the document in its
+     * current state.
+     * @return a string containing an XML representation of the document in its
+     *         current state
+     */
+    private String getUpdatedXML() {
+        StringWriter writer = new StringWriter();
+        try {
+            TransformerFactory tf = TransformerFactory.newInstance();
+            Transformer transformer = tf.newTransformer();
+            DOMSource source = new DOMSource(doc);
+            StreamResult result = new StreamResult(writer);
+            transformer.transform(source, result);
+        } catch (TransformerConfigurationException e) {
+            e.printStackTrace();
+            System.exit(1);
+        } catch (TransformerException e) {
+            e.printStackTrace();
+            System.exit(1);
+        }
+        return writer.toString();
+    }
+
+    /**
+     * Converts Beta Code to Greek characters. If the transcoder reports an
+     * unsupported encoding, a stack trace is printed and the program
+     * terminates.
+     * @param beta the Beta Code to transcode
+     * @return the Greek equivalent of the specified Beta Code
+     */
+    private String betaToGreek(String beta) {
+        String greek = null;
+        try {
+            greek = transcoder.getString(beta);
+        } catch (UnsupportedEncodingException e) {
+            e.printStackTrace();
+            System.exit(1);
+        }
+        return greek;
+    }
+}
--
cgit v1.2.3