diff options
author | Franciszek Malinka <franciszek.malinka@gmail.com> | 2022-11-06 23:45:03 +0100 |
---|---|---|
committer | Franciszek Malinka <franciszek.malinka@gmail.com> | 2022-11-06 23:48:26 +0100 |
commit | b267dde1715479f38a2c73cc4c3039fff8618945 (patch) | |
tree | bee78c6667af0ffcb75366acc0475421febb0dba | |
parent | 245e25b0662049809aa94718caef863ec5a48e0e (diff) |
Ported HeapPageEncoder for our usage.
-rw-r--r-- | encoder/HeapFileEncoder.java | 300 | ||||
-rw-r--r-- | encoder/README.md | 14 |
2 files changed, 314 insertions, 0 deletions
diff --git a/encoder/HeapFileEncoder.java b/encoder/HeapFileEncoder.java new file mode 100644 index 0000000..ef5be93 --- /dev/null +++ b/encoder/HeapFileEncoder.java @@ -0,0 +1,300 @@ +import java.io.*; +import java.util.Arrays; +import java.util.List; + +/** + * HeapFileEncoder reads a comma delimited text file or accepts + * an array of tuples and converts it to + * pages of binary data in the appropriate format for simpledb heap pages + * Pages are padded out to a specified length, and written consecutive in a + * data file. + */ + +public class HeapFileEncoder { + + /** + * Class representing a type in SimpleDB. + * Types are static objects defined by this class; hence, the Type + * constructor is private. + */ + public enum Type implements Serializable { + INT_TYPE() { + @Override + public int getLen() { + return 4; + } + + }, STRING_TYPE() { + @Override + public int getLen() { + return STRING_LEN + 4; + } + }; + public static final int STRING_LEN = 128; + public abstract int getLen(); + } + + public static void main(String[] args) { + try { + if (args.length < 2 || args.length > 4) { + System.err.println("Unexpected number of arguments to convert "); + return; + } + File sourceTxtFile = new File(args[0]); + File targetDatFile = new File(args[0].replaceAll(".txt", ".dat")); + int numOfAttributes = Integer.parseInt(args[1]); + Type[] ts = new Type[numOfAttributes]; + char fieldSeparator = ','; + + if (args.length == 2) + for (int i = 0; i < numOfAttributes; i++) + ts[i] = Type.INT_TYPE; + else { + String typeString = args[2]; + String[] typeStringAr = typeString.split(","); + if (typeStringAr.length != numOfAttributes) { + System.err.println("The number of types does not agree with the number of columns"); + return; + } + int index = 0; + for (String s : typeStringAr) { + if (s.equalsIgnoreCase("int")) + ts[index++] = Type.INT_TYPE; + else if (s.equalsIgnoreCase("string")) + ts[index++] = Type.STRING_TYPE; + else { + System.err.println("Unknown type " + s); + return; + } + } + if (args.length == 5) + fieldSeparator = args[3].charAt(0); + } + + HeapFileEncoder.convert(sourceTxtFile, targetDatFile, + 4096, numOfAttributes, ts, fieldSeparator); + + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + + /** + * Convert the specified tuple list (with only integer fields) into a binary + * page file. <br> + * <p> + * The format of the output file will be as specified in HeapPage and + * HeapFile. + * + * @param tuples the tuples - a list of tuples, each represented by a list of integers that are + * the field values for that tuple. + * @param outFile The output file to write data to + * @param npagebytes The number of bytes per page in the output file + * @param numFields the number of fields in each input tuple + * @throws IOException if the temporary/output file can't be opened + * @see HeapPage + * @see HeapFile + */ + public static void convert(List<List<Integer>> tuples, File outFile, int npagebytes, int numFields) throws IOException { + File tempInput = File.createTempFile("tempTable", ".txt"); + tempInput.deleteOnExit(); + BufferedWriter bw = new BufferedWriter(new FileWriter(tempInput)); + for (List<Integer> tuple : tuples) { + int writtenFields = 0; + for (Integer field : tuple) { + writtenFields++; + if (writtenFields > numFields) { + throw new RuntimeException("Tuple has more than " + numFields + " fields: (" + + listToString(tuple) + ")"); + } + bw.write(String.valueOf(field)); + if (writtenFields < numFields) { + bw.write(','); + } + } + bw.write('\n'); + } + bw.close(); + convert(tempInput, outFile, npagebytes, numFields); + } + + public static void convert(File inFile, File outFile, int npagebytes, + int numFields) throws IOException { + Type[] ts = new Type[numFields]; + Arrays.fill(ts, Type.INT_TYPE); + convert(inFile, outFile, npagebytes, numFields, ts); + } + + public static void convert(File inFile, File outFile, int npagebytes, + int numFields, Type[] typeAr) + throws IOException { + convert(inFile, outFile, npagebytes, numFields, typeAr, ','); + } + + /** + * Convert the specified input text file into a binary + * page file. <br> + * Assume format of the input file is (note that only integer fields are + * supported):<br> + * int,...,int\n<br> + * int,...,int\n<br> + * ...<br> + * where each row represents a tuple.<br> + * <p> + * The format of the output file will be as specified in HeapPage and + * HeapFile. + * + * @param inFile The input file to read data from + * @param outFile The output file to write data to + * @param npagebytes The number of bytes per page in the output file + * @param numFields the number of fields in each input line/output tuple + * @throws IOException if the input/output file can't be opened or a + * malformed input line is encountered + * @see HeapPage + * @see HeapFile + */ + public static void convert(File inFile, File outFile, int npagebytes, + int numFields, Type[] typeAr, char fieldSeparator) + throws IOException { + + int nrecbytes = 0; + for (int i = 0; i < numFields; i++) { + nrecbytes += typeAr[i].getLen(); + } + int nrecords = (npagebytes * 8) / (nrecbytes * 8 + 1); //floor comes for free + + // per record, we need one bit; there are nrecords per page, so we need + // nrecords bits, i.e., ((nrecords/32)+1) integers. + int nheaderbytes = (nrecords / 8); + if (nheaderbytes * 8 < nrecords) + nheaderbytes++; //ceiling + int nheaderbits = nheaderbytes * 8; + + BufferedReader br = new BufferedReader(new FileReader(inFile)); + FileOutputStream os = new FileOutputStream(outFile); + + // our numbers probably won't be much larger than 1024 digits + char[] buf = new char[1024]; + + int curpos = 0; + int recordcount = 0; + int npages = 0; + int fieldNo = 0; + + ByteArrayOutputStream headerBAOS = new ByteArrayOutputStream(nheaderbytes); + DataOutputStream headerStream = new DataOutputStream(headerBAOS); + ByteArrayOutputStream pageBAOS = new ByteArrayOutputStream(npagebytes); + DataOutputStream pageStream = new DataOutputStream(pageBAOS); + + boolean done = false; + boolean first = true; + while (!done) { + int c = br.read(); + + // Ignore Windows/Notepad special line endings + if (c == '\r') + continue; + + if (c == '\n') { + if (first) + continue; + recordcount++; + first = true; + } else + first = false; + if (c == fieldSeparator || c == '\n' || c == '\r') { + String s = new String(buf, 0, curpos); + if (typeAr[fieldNo] == Type.INT_TYPE) { + try { + pageStream.writeInt(Integer.parseInt(s.trim())); + } catch (NumberFormatException e) { + System.out.println("BAD LINE : " + s); + } + } else if (typeAr[fieldNo] == Type.STRING_TYPE) { + s = s.trim(); + int overflow = Type.STRING_LEN - s.length(); + if (overflow < 0) { + s = s.substring(0, Type.STRING_LEN); + } + pageStream.writeInt(s.length()); + pageStream.writeBytes(s); + while (overflow-- > 0) + pageStream.write((byte) 0); + } + curpos = 0; + if (c == '\n') + fieldNo = 0; + else + fieldNo++; + + } else if (c == -1) { + done = true; + + } else { + buf[curpos++] = (char) c; + continue; + } + + // if we wrote a full page of records, or if we're done altogether, + // write out the header of the page. + // + // in the header, write a 1 for bits that correspond to records we've + // written and 0 for empty slots. + // + // when we're done, also flush the page to disk, but only if it has + // records on it. however, if this file is empty, do flush an empty + // page to disk. + if (recordcount >= nrecords + || done && recordcount > 0 + || done && npages == 0) { + int i = 0; + byte headerbyte = 0; + + for (i = 0; i < nheaderbits; i++) { + if (i < recordcount) + headerbyte |= (1 << (i % 8)); + + if (((i + 1) % 8) == 0) { + headerStream.writeByte(headerbyte); + headerbyte = 0; + } + } + + if (i % 8 > 0) + headerStream.writeByte(headerbyte); + + // pad the rest of the page with zeroes + + for (i = 0; i < (npagebytes - (recordcount * nrecbytes + nheaderbytes)); i++) + pageStream.writeByte(0); + + // write header and body to file + headerStream.flush(); + headerBAOS.writeTo(os); + pageStream.flush(); + pageBAOS.writeTo(os); + + // reset header and body for next page + headerBAOS = new ByteArrayOutputStream(nheaderbytes); + headerStream = new DataOutputStream(headerBAOS); + pageBAOS = new ByteArrayOutputStream(npagebytes); + pageStream = new DataOutputStream(pageBAOS); + + recordcount = 0; + npages++; + } + } + br.close(); + os.close(); + } + + public static String listToString(List<Integer> list) { + StringBuilder out = new StringBuilder(); + for (Integer i : list) { + if (out.length() > 0) out.append("\t"); + out.append(i); + } + return out.toString(); + } +} diff --git a/encoder/README.md b/encoder/README.md new file mode 100644 index 0000000..39dc3bc --- /dev/null +++ b/encoder/README.md @@ -0,0 +1,14 @@ +# Heap file encoder + +This is taken and patched from the MIT SimpleDB. Simple usage: + +- Create a test file `test.txt`: + ``` + 1,lol,2,xD + 3,lmao,4,siema + ``` +- Then compile and run the encoder: + ``` + javac HeapFileEncoder.java + java HeapFileEncoder test.txt 4 int,string,int,string + ``` |