first commit

This commit is contained in:
2024-08-12 17:08:47 +02:00
commit acd1f8cc7e
6 changed files with 141 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
out/

62
app.ts Normal file
View File

@@ -0,0 +1,62 @@
/**
 * Browser front end of the XML cleaner.
 *
 * Reads the XML files chosen in the `#in` file input, extracts the text of
 * every `tartalom` element, has a web worker clean each text, and offers the
 * combined result as a `result.txt` download.
 */
export default class App {
  /** File picker (`#in`) the XML files are read from. */
  input: HTMLInputElement;
  /**
   * Element with id `out`. It is looked up here but never read in this class.
   * NOTE(review): the index.html shipped with this commit has no `#out`
   * element, so this is `null` at runtime despite the declared type — confirm
   * whether the field is still needed.
   */
  outputContainer: HTMLTextAreaElement;

  /** Looks up the page elements and wires the `#run` button to {@link run}. */
  constructor() {
    this.input = document.getElementById('in') as HTMLInputElement;
    this.outputContainer = document.getElementById('out') as HTMLTextAreaElement;
    const runButton = document.getElementById('run') as HTMLButtonElement;
    runButton.onclick = () => {
      // Surface failures (e.g. an unreadable file) instead of leaving an
      // unhandled promise rejection behind.
      this.run().catch((error: unknown) => {
        console.error('XML cleaning run failed:', error);
      });
    };
  }

  /**
   * Processes every selected file and triggers a download of the joined result.
   *
   * Does nothing when no file is selected. The per-file results are joined
   * with a single space, in selection order.
   *
   * @returns A promise that settles once the download has been triggered.
   */
  async run(): Promise<void> {
    const selected = this.input.files;
    if (selected === null || selected.length === 0) {
      // Nothing to do; avoid downloading an empty result.txt.
      return;
    }
    const parts = await Promise.all(
      Array.from(selected, (file) => this.parseFile(file)),
    );
    const output = new Blob([parts.join(' ')], { type: 'text/plain' });
    const url = URL.createObjectURL(output);
    const a = document.createElement('a');
    a.href = url;
    a.download = "result.txt";
    a.click();
    // Release the blob once the browser has had ample time to start the
    // download; revoking synchronously can cancel it in some browsers.
    setTimeout(() => URL.revokeObjectURL(url), 60000);
  }

  /**
   * Extracts and cleans the text of every `tartalom` element in one XML file.
   *
   * Elements are matched in any namespace and processed one after another.
   * A node whose worker fails is logged and skipped, so one bad node does not
   * discard the rest of the file.
   *
   * @param file The XML file to read.
   * @returns The cleaned texts joined with a single space.
   */
  async parseFile(file: File): Promise<string> {
    const xml = await file.text();
    const doc = new DOMParser().parseFromString(xml, "text/xml");
    // DOMParser reports malformed XML through a <parsererror> element rather
    // than by throwing; warn so truncated output is not a silent surprise.
    if (doc.getElementsByTagName('parsererror').length > 0) {
      console.warn(`${file.name}: XML is not well-formed; output may be incomplete.`);
    }
    const results: string[] = [];
    for (const node of Array.from(doc.getElementsByTagNameNS('*', 'tartalom'))) {
      try {
        results.push(await this.cleanInWorker(node.textContent ?? ''));
      } catch (e) {
        console.error(e);
      }
    }
    return results.join(' ');
  }

  /**
   * Sends one text to a fresh worker and resolves with the worker's reply.
   *
   * The worker is always terminated afterwards, whether it replied or failed,
   * so repeated calls do not accumulate idle worker threads.
   *
   * NOTE(review): 'worker.js' is resolved relative to the page URL, not to
   * this module, while the readme deploys the compiled scripts under `out/` —
   * confirm the worker script is actually reachable at this path.
   *
   * @param text The raw text to clean.
   * @returns The cleaned text posted back by the worker.
   */
  private cleanInWorker(text: string): Promise<string> {
    const worker = new Worker('worker.js');
    return new Promise<string>((resolve, reject) => {
      worker.onmessage = (e: MessageEvent<string>) => {
        resolve(e.data);
      };
      worker.onerror = (e) => {
        // `error` is absent when e.g. the script fails to load; always reject
        // with a real Error so the log is meaningful.
        reject(e.error ?? new Error(e.message || 'Worker failed'));
      };
      worker.postMessage(text);
    }).finally(() => {
      worker.terminate();
    });
  }
}
new App();

37
index.html Normal file
View File

@@ -0,0 +1,37 @@
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <meta name="description" content="XML Cleaner">
  <title>XML Cleaner</title>
  <style>
    body,
    html {
      margin: auto;
      height: 100%;
    }
    /* Centre the controls horizontally and vertically on a dark page. */
    body {
      color: white;
      background-color: black;
      display: flex;
      max-width: 800px;
      justify-content: center;
      align-items: center;
    }
    textarea {
      width: 100%;
      height: 100%;
    }
  </style>
</head>
<body>
  <!-- File picker read by app.js via id "in"; RUN starts processing via id "run".
       The accept list includes the .xml extension and both XML MIME types,
       because platforms differ in which MIME type they report for XML files. -->
  <input type="file" id="in" multiple accept=".xml,text/xml,application/xml" aria-label="XML files to clean"><button type="button" id="run">RUN</button>
  <!-- Compiled output of app.ts (tsc writes to the out/ folder). -->
  <script type="module" src="out/app.js"></script>
</body>
</html>

16
readme.md Normal file
View File

@@ -0,0 +1,16 @@
# XML Cleaner
Simple webapp to clean up XML datasets.
Currently hardcoded to find the `tartalom` tags in multiple files,
and dump their text content into a single txt file.
Removes all XML tags contained in CDATA strings, along with unnecessary whitespace.
## Building
1. Run the TypeScript compiler.
```
tsc
```
2. Deploy `index.html` and the `out` folder to a web server.

11
tsconfig.json Normal file
View File

@@ -0,0 +1,11 @@
{
"compilerOptions": {
"target": "ESNext",
"module": "ESNext",
"outDir": "./out",
"esModuleInterop": true,
"forceConsistentCasingInFileNames": true,
"strict": true,
"skipLibCheck": true
}
}

14
worker.ts Normal file
View File

@@ -0,0 +1,14 @@
/** Matches one markup tag: `<`, one or more characters other than `>`, then `>`. */
const singleTagRegEx = /<[^>]+?>/g;
/** Matches a run of two or more whitespace characters, capturing the first one. */
const whitespaceRegEx = /(\s)\s+/g;

/**
 * Strips markup tags from `text` and collapses whitespace runs.
 *
 * Tags are removed first and whitespace is collapsed afterwards. Removing a
 * tag can join the whitespace on either side of it into a new run, e.g.
 * `"foo <b> bar"`, which would survive if the collapse ran first.
 *
 * @param text The raw text, possibly containing markup tags.
 * @returns The text with every tag removed and each run of consecutive
 *   whitespace reduced to its first character.
 */
function clean(text: string): string {
  // Both patterns are global, so `replace` substitutes every occurrence.
  return text.replace(singleTagRegEx, "").replace(whitespaceRegEx, "$1");
}
// Worker entry point: clean each incoming message payload and post the
// cleaned text back to the sender.
onmessage = ({ data }) => {
  postMessage(clean(data));
};