/**
 * Browser entry point of the XML cleaner.
 *
 * Reads the files chosen in the `#in` file input, extracts the text of every
 * `tartalom` element from each of them, passes that text through the
 * `worker.js` web worker and offers the combined result as a `result.txt`
 * download. The whole flow is started by clicking the `#run` button.
 */
export default class App {
  input: HTMLInputElement;
  outputContainer: HTMLTextAreaElement;

  /** Looks up the page elements by id and wires the `#run` button to {@link App.run}. */
  constructor() {
    this.input = document.getElementById('in') as HTMLInputElement;
    this.outputContainer = document.getElementById('out') as HTMLTextAreaElement;
    (document.getElementById("run") as HTMLButtonElement).onclick = () => {
      // run() is async; log failures instead of leaving an unhandled rejection.
      void this.run().catch((e: unknown) => console.error(e));
    };
  }

  /**
   * Processes all selected files in parallel and triggers the download of the
   * space-joined result. Does nothing when no file is selected.
   */
  async run(): Promise<void> {
    const files = this.input.files;
    if (files === null || files.length === 0) {
      return;
    }

    const parts = await Promise.all(
      Array.from(files, (file) => this.parseFile(file)),
    );

    const output = new Blob([parts.join(' ')], { type: 'text/plain' });
    // A temporary, detached anchor is enough to start a download.
    // NOTE: the object URL created here is not revoked by this method.
    const a = document.createElement('a');
    a.href = URL.createObjectURL(output);
    a.download = "result.txt";
    a.click();
  }

  /**
   * Parses one file as XML and returns the worker output for each `tartalom`
   * element (matched in any namespace), joined with single spaces.
   *
   * Elements are processed one after another. A failure for one element is
   * logged and skipped so the remaining elements are still processed.
   *
   * @param file XML file selected by the user.
   * @returns The joined worker results for this file.
   */
  async parseFile(file: File): Promise<string> {
    const xml = await file.text();
    const doc = new DOMParser().parseFromString(xml, "text/xml");

    const results: string[] = [];

    for (const node of doc.getElementsByTagNameNS('*', 'tartalom')) {
      try {
        results.push(await this.cleanInWorker(node.textContent ?? ''));
      } catch (e: unknown) {
        console.error(e);
      }
    }

    return results.join(' ');
  }

  /**
   * Sends `text` to a fresh `worker.js` worker and resolves with the first
   * message it posts back. The worker is terminated as soon as it answers or
   * fails, so workers do not pile up while a large file is processed.
   */
  private cleanInWorker(text: string): Promise<string> {
    return new Promise<string>((resolve, reject) => {
      const worker = new Worker('worker.js');

      worker.onmessage = (e: MessageEvent<string>) => {
        worker.terminate();
        resolve(e.data);
      };

      worker.onerror = (e: ErrorEvent) => {
        worker.terminate();
        // `error` may be missing on the event; fall back to its message.
        reject(e.error ?? new Error(e.message));
      };

      worker.postMessage(text);
    });
  }
}

new App();
/** Matches a single markup tag such as `<p>` or `</span>`. */
const singleTagRegEx = /<[^>]+?>/g;
/** Matches a run of two or more whitespace characters, capturing the first one. */
const whitespaceRegEx = /(\s)\s+/g;

/**
 * Strips markup tags from `text` and collapses every run of whitespace to its
 * first character.
 *
 * Tags are removed before whitespace is collapsed: removing a tag that sat
 * between two whitespace characters (e.g. `"a <b> c"`) creates a new
 * whitespace run, which would survive if the order were reversed.
 *
 * @param text Raw text that may contain markup tags.
 * @returns The text without tags and without repeated whitespace.
 */
function clean(text: string): string {
  return text
    .replace(singleTagRegEx, "")
    .replace(whitespaceRegEx, "$1");
}

// Worker entry point: clean each incoming message and post the result back.
// The handler is set on `globalThis`, which is the worker's global scope.
globalThis.onmessage = (e: MessageEvent) => {
  const result = clean(e.data);
  postMessage(result);
};