From 711c3b4802ce5835c36680967e5ee6afc040d11c Mon Sep 17 00:00:00 2001 From: retoor Date: Fri, 22 Nov 2024 20:37:42 +0100 Subject: [PATCH] Last version --- .gitignore | 1 + README.md | 24 +++++++++++++++++++++++ pdf2text | 51 ++++++++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 2 ++ 4 files changed, 78 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100755 pdf2text create mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5ceb386 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +venv diff --git a/README.md b/README.md new file mode 100644 index 0000000..b3e81ce --- /dev/null +++ b/README.md @@ -0,0 +1,24 @@ +# PDF2Text + +I've converted 8 GB of PDFs to text in one afternoon on a decade-old x270 using this script. Performant enough imho. Try getting 8 GB into your LLM and having it actually use it. That's the challenge. + +## Convert all PDFs to text +This is a script for converting a batch of PDFs to text for machine learning. +It only has two dependencies: + - python3 + - pdfminer.six (Python requirement, specified in the requirements.txt file) + +## Installation +```bash +python3 -m venv .venv +source .venv/bin/activate +pip install -r requirements.txt +``` + +## Usage: +Activate your virtual environment. +```bash +source .venv/bin/activate +./pdf2text [source/destination dir] +``` +You read that correctly: the source directory is also the destination directory. 
#!/usr/bin/env python3
"""Convert every PDF in a directory to a UTF-8 text file next to it.

Usage: pdf2text [path]

The source directory is also the destination directory: ``foo.pdf`` becomes
``foo.txt``. PDFs whose ``.txt`` file already exists are skipped, so an
interrupted run can simply be started again.

Requires the ``pdfminer.six`` package (the single entry in requirements.txt).
"""
import pathlib
import sys
import time

try:
    from pdfminer.high_level import extract_text
except ImportError:
    # Reported by main()/pdf_to_txt() so that usage errors can still be shown
    # with a readable message instead of an import traceback.
    extract_text = None


def pdf_to_txt(pdf_path, txt_path):
    """Extract the text of *pdf_path* and write it to *txt_path* as UTF-8.

    The text is first written to a sibling ``<name>.part`` file which is
    renamed onto *txt_path* only after the write has completed. A failed or
    interrupted write therefore never leaves a truncated ``.txt`` behind that
    a later run would mistake for a finished conversion and skip.

    Args:
        pdf_path: Path of the PDF file to read.
        txt_path: Path (``str`` or ``pathlib.Path``) of the text file to
            create; an existing file is replaced.

    Returns:
        True once the text file has been written.

    Raises:
        RuntimeError: If pdfminer.six is not installed.
        Exception: Any error raised while parsing the PDF or writing the
            output is propagated unchanged to the caller.
    """
    if extract_text is None:
        raise RuntimeError(
            "pdfminer.six is not installed. Run: pip install -r requirements.txt"
        )

    txt_path = pathlib.Path(txt_path)
    text = extract_text(pdf_path)

    partial_path = txt_path.with_name(txt_path.name + ".part")
    try:
        with partial_path.open("w", encoding="utf-8") as f:
            f.write(text)
        partial_path.replace(txt_path)
    finally:
        # After a successful rename the partial file is already gone; on any
        # failure (including Ctrl-C) remove whatever was written so far.
        try:
            partial_path.unlink()
        except FileNotFoundError:
            pass
    return True


def _find_pdfs(directory):
    """Return absolute paths of the PDF files directly inside *directory*.

    The suffix is compared case-insensitively so ``SCAN.PDF`` is converted
    too, and directories that merely end in ``.pdf`` are ignored.
    """
    return sorted(
        entry.absolute()
        for entry in directory.iterdir()
        if entry.is_file() and entry.suffix.lower() == ".pdf"
    )


def main(argv=None):
    """Run the batch conversion and return the process exit status.

    Args:
        argv: Argument vector including the program name; defaults to
            ``sys.argv``.

    Returns:
        0 when the batch ran (individual PDF failures are reported but do not
        fail the run) or the user cancelled, 2 when the path argument is
        missing, 1 when the path is unusable or pdfminer.six is missing.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        print("Usage: pdf2text [path].", file=sys.stderr)
        return 2

    source_path = pathlib.Path(argv[1])
    if not source_path.exists():
        print(f"{source_path.absolute()} does not exist.", file=sys.stderr)
        return 1
    if not source_path.is_dir():
        print(f"{source_path.absolute()} is not a directory.", file=sys.stderr)
        return 1
    if extract_text is None:
        print(
            "pdfminer.six is not installed. Run: pip install -r requirements.txt",
            file=sys.stderr,
        )
        return 1

    print("This script will convert all your pdf files to txt files in the same directory.")
    try:
        answer = input("Continue? [Y/n]: ")
    except EOFError:
        # No interactive stdin means nobody confirmed; do not touch anything.
        answer = "n"
    if answer.strip().lower() in ("n", "no"):
        print("Operation cancelled.")
        return 0

    failures = 0
    for source_file in _find_pdfs(source_path):
        destination_file = source_file.with_suffix(".txt")
        if destination_file.exists():
            print(f"Already exists, skipping: {destination_file}.")
            continue

        print(f"Creating to {destination_file}.")
        start = time.perf_counter()
        try:
            pdf_to_txt(source_file, destination_file)
        except Exception as ex:
            # Best-effort batch: one broken PDF must not stop the rest, but
            # say which file failed so it can be looked at afterwards.
            failures += 1
            print(f"Failed to convert {source_file}: {ex}", file=sys.stderr)
        else:
            duration = time.perf_counter() - start
            print(f"Took {duration}s.")

    if failures:
        print(f"{failures} file(s) could not be converted.", file=sys.stderr)
    return 0


if __name__ == "__main__":
    sys.exit(main())