From 711c3b4802ce5835c36680967e5ee6afc040d11c Mon Sep 17 00:00:00 2001
From: retoor <retoor@molodetz.nl>
Date: Fri, 22 Nov 2024 20:37:42 +0100
Subject: [PATCH] Last version

---
 .gitignore       |  1 +
 README.md        | 24 +++++++++++++++++++++++
 pdf2text         | 51 ++++++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt |  2 ++
 4 files changed, 78 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100755 pdf2text
 create mode 100644 requirements.txt

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..5ceb386
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+venv
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..b3e81ce
--- /dev/null
+++ b/README.md
@@ -0,0 +1,24 @@
+# PDF2Text
+
+I've converted 8 GB of PDFs to text in one afternoon on a decade-old x270 using this script. Performant enough, imho. Try getting 8 GB into your LLM and getting it to actually use it. That's the challenge.
+
+## Convert all PDFs to text
+This is a script for converting a batch of PDFs to text for machine learning.
+It only has two dependencies:
+ - python3
+ - pdfminer.six (Python requirement, specified in the requirements.txt file)
+
+## Installation
+```bash
+python3 -m venv .venv
+source .venv/bin/activate
+pip install -r requirements.txt
+```
+
+## Usage:
+Activate your virtual environment.
+```bash
+source .venv/bin/activate
+./pdf2text [source/destination dir]
+```
+You read that correctly: the source directory is also the destination directory.
diff --git a/pdf2text b/pdf2text
new file mode 100755
index 0000000..53851b1
--- /dev/null
+++ b/pdf2text
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+import pathlib
+import sys
+from pdfminer.high_level import extract_text
+import time
+
+
+def pdf_to_txt(pdf_path, txt_path):
+    try:
+        text = extract_text(pdf_path)
+        with txt_path.open("w+", encoding="utf-8") as f:
+            f.write(text)
+    except Exception as ex:
+        raise
+    return True
+
+
+try:
+    source_path = sys.argv[1]
+except IndexError:
+    raise Exception(f"Usage: pdf2text [path].")
+
+source_path = pathlib.Path(source_path)
+if not source_path.exists():
+    raise Exception(f"{source_path.absolute()} does not exist.")
+
+print("This script will convert all your pdf files to txt files in the same directory.")
+if input("Continue? [Y/n]: ").strip().lower() in ["n", "no"]:
+    print("Operation cancelled.")
+    exit(0)
+
+for file in pathlib.Path(source_path).glob("*.pdf"):
+
+    source_file = file.absolute()
+    destination_file = pathlib.Path(str(file.absolute())[:-4] + ".txt")
+    if destination_file.exists():
+        print(f"Already exists, skipping: {destination_file.absolute()}.")
+        continue
+
+    print(f"Creating to {destination_file.absolute()}.")
+
+    start = time.time()
+    try:
+        pdf_to_txt(source_file, destination_file)
+    except Exception as ex:
+        print(ex)
+        # Just continue, who cares
+
+    finish = time.time()
+    duration = finish - start
+    print(f"Took {duration}s.")
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..3359bda
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+pdfminer.six
+