Last version

2024-11-22 20:37:42 +01:00 · 2024-11-22 20:37:42 +01:00 · 711c3b4802
commit 711c3b4802
4 changed files with 78 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1 @@
 venv
--- a/README.md
+++ b/README.md
@ -0,0 +1,24 @@
 # PDF2Text
 I converted 8 GB of PDFs to text in one afternoon on a decade-old x270 using this script. Performant enough, IMHO. Try getting 8 GB into your LLM and having it actually use it. That's the challenge.
 ## Convert all PDFs to text
 This is a script for converting a batch of PDFs to text for machine learning.
 It only has two dependencies:
 - python3
 - pdfminer.six (Python requirement, specified in the requirements.txt file)
 ## Installation
 ```bash
 python3 -m venv .venv
 source .venv/bin/activate
 pip install -r requirements.txt
 ```
 ## Usage:
 Activate your virtual environment.
 ```bash
 source .venv/bin/activate
 ./pdf2text [source/destination dir]
 ```
 You read that correctly, the source directory is also the destination directory.
--- a/51
+++ b/51
@ -0,0 +1,51 @@
 #!/usr/bin/env python
 import pathlib
 import sys
 from pdfminer.high_level import extract_text
 import time
 def pdf_to_txt(pdf_path, txt_path):
    """Extract the text of one PDF and store it as a UTF-8 text file.

    The text is first written to a temporary sibling file
    (``<txt_path>.part``) and only renamed to ``txt_path`` once the write
    has fully succeeded. A failed or interrupted write therefore never
    leaves a truncated ``.txt`` file behind that a later run would
    mistake for finished output.

    Args:
        pdf_path: Path of the PDF to read; passed to ``extract_text``.
        txt_path: Destination of the text file (``str`` or
            ``pathlib.Path``).

    Returns:
        True when the text file has been written.

    Raises:
        Exception: Any error raised by ``extract_text`` or while writing
            the output is propagated to the caller unchanged.
    """
    txt_path = pathlib.Path(txt_path)
    # Extract before touching the filesystem so that a broken PDF
    # creates no output file at all.
    text = extract_text(pdf_path)
    part_path = txt_path.with_name(txt_path.name + ".part")
    try:
        with part_path.open("w", encoding="utf-8") as f:
            f.write(text)
        # Path.replace overwrites the target on every platform.
        part_path.replace(txt_path)
    except BaseException:
        # Also clean up on KeyboardInterrupt, then re-raise untouched.
        if part_path.exists():
            part_path.unlink()
        raise
    return True
 def main():
    """Convert every PDF in the directory given on the command line.

    Reads the directory from ``sys.argv[1]``, asks for confirmation, and
    writes a ``.txt`` file next to each PDF via ``pdf_to_txt``. PDFs whose
    ``.txt`` file already exists are skipped. A failure on one file is
    printed and the run continues with the next file.

    Exits with an error message (non-zero status) when the argument is
    missing, the path does not exist, or the path is not a directory.
    Exits with status 0 when the user declines the confirmation.
    """
    if len(sys.argv) < 2:
        sys.exit("Usage: pdf2text [path].")

    source_path = pathlib.Path(sys.argv[1])
    if not source_path.exists():
        sys.exit(f"{source_path.absolute()} does not exist.")
    if not source_path.is_dir():
        sys.exit(f"{source_path.absolute()} is not a directory.")

    print("This script will convert all your pdf files to txt files in the same directory.")
    if input("Continue? [Y/n]: ").strip().lower() in ("n", "no"):
        print("Operation cancelled.")
        sys.exit(0)

    # Match the suffix case-insensitively so "REPORT.PDF" is converted
    # too; sorting gives a stable, predictable processing order.
    for file in sorted(source_path.iterdir()):
        if file.suffix.lower() != ".pdf" or not file.is_file():
            continue
        source_file = file.absolute()
        destination_file = source_file.with_suffix(".txt")
        if destination_file.exists():
            print(f"Already exists, skipping: {destination_file}.")
            continue
        print(f"Creating to {destination_file}.")
        start = time.perf_counter()
        try:
            pdf_to_txt(source_file, destination_file)
        except Exception as ex:
            # Deliberately best-effort: one unreadable PDF must not
            # abort the conversion of the whole batch.
            print(ex)
        duration = time.perf_counter() - start
        print(f"Took {duration}s.")


 if __name__ == "__main__":
    main()
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,2 @@
 pdfminer.six