Last version
This commit is contained in:
commit
711c3b4802
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
venv
|
24
README.md
Normal file
24
README.md
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
# PDF2Text
|
||||||
|
|
||||||
|
I've converted 8 GB of PDFs to text in one afternoon on a decade-old x270 using this script. Performant enough, imho. Try getting 8 GB into your LLM and getting it to actually use it. That's the challenge.
|
||||||
|
|
||||||
|
## Convert all PDFs to text
|
||||||
|
This is a script for converting a batch of PDFs to text for machine learning.
|
||||||
|
It only has two dependencies:
|
||||||
|
- python3
|
||||||
|
- pdfminer.six (Python requirement, specified in the requirements.txt file)
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
```bash
|
||||||
|
python3 -m venv .venv
|
||||||
|
source .venv/bin/activate
|
||||||
|
pip install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
## Usage:
|
||||||
|
Activate your virtual environment.
|
||||||
|
```bash
|
||||||
|
source .venv/bin/activate
|
||||||
|
./pdf2text [source/destination dir]
|
||||||
|
```
|
||||||
|
You read that correctly, the source directory is also the destination directory.
|
51
pdf2text
Executable file
51
pdf2text
Executable file
@ -0,0 +1,51 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
import pathlib
|
||||||
|
import sys
|
||||||
|
from pdfminer.high_level import extract_text
|
||||||
|
import time
|
||||||
|
|
||||||
|
|
||||||
|
def pdf_to_txt(pdf_path, txt_path):
    """Extract the text of one PDF and save it as a UTF-8 text file.

    Args:
        pdf_path: Location of the PDF; handed unchanged to pdfminer's
            ``extract_text``.
        txt_path: ``pathlib.Path`` of the text file to create. An existing
            file at this path is overwritten.

    Returns:
        True once the text file has been written.

    Raises:
        Exception: Anything raised by the extraction or by the file write
            propagates to the caller unchanged.
    """
    text = extract_text(pdf_path)

    # Write to a sibling temp file and rename it into place, so a failed
    # write (encoding error, full disk, Ctrl-C) never leaves a truncated
    # .txt behind that a later run would mistake for a finished conversion.
    tmp_path = txt_path.with_name(txt_path.name + ".part")
    try:
        with tmp_path.open("w", encoding="utf-8") as f:
            f.write(text)
        tmp_path.replace(txt_path)
    finally:
        # After a successful replace() the temp file is gone; it only still
        # exists here when something above failed.
        if tmp_path.exists():
            tmp_path.unlink()
    return True
|
||||||
|
|
||||||
|
|
||||||
|
# Read the single required argument: the directory that holds the PDFs and
# that also receives the generated .txt files.
try:
    source_path = sys.argv[1]
except IndexError:
    # Exit with a short message on stderr (status 1) instead of an
    # uncaught-exception traceback.
    sys.exit("Usage: pdf2text [path].")

source_path = pathlib.Path(source_path)
if not source_path.exists():
    sys.exit(f"{source_path.absolute()} does not exist.")
if not source_path.is_dir():
    # Globbing a regular file yields nothing, so without this check the
    # script would silently do no work.
    sys.exit(f"{source_path.absolute()} is not a directory.")
|
||||||
|
|
||||||
|
# Ask for confirmation before writing anything. The default answer is "yes":
# only an explicit "n" or "no" (any case, surrounding whitespace ignored)
# aborts.
print("This script will convert all your pdf files to txt files in the same directory.")
if input("Continue? [Y/n]: ").strip().lower() in ["n", "no"]:
    print("Operation cancelled.")
    # sys.exit() instead of the bare exit() helper: exit() is injected by the
    # site module for interactive use and is missing under `python -S`.
    sys.exit(0)
|
||||||
|
|
||||||
|
# Convert every *.pdf directly inside source_path (no recursion) to a .txt
# file with the same base name in the same directory.
for file in source_path.glob("*.pdf"):
    source_file = file.absolute()
    destination_file = source_file.with_suffix(".txt")

    # An existing .txt is treated as already converted, which makes the
    # script safe to re-run after an interruption.
    if destination_file.exists():
        print(f"Already exists, skipping: {destination_file.absolute()}.")
        continue

    print(f"Creating to {destination_file.absolute()}.")

    # perf_counter() is monotonic, so the measured duration cannot be
    # distorted by system clock adjustments.
    start = time.perf_counter()
    try:
        pdf_to_txt(source_file, destination_file)
    except Exception as ex:
        # Best effort: one unreadable PDF must not stop the batch. Report
        # which file failed and why, then carry on with the next one.
        print(
            f"Failed to convert {source_file}: {type(ex).__name__}: {ex}",
            file=sys.stderr,
        )

    duration = time.perf_counter() - start
    print(f"Took {duration}s.")
|
2
requirements.txt
Normal file
2
requirements.txt
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
pdfminer.six
|
||||||
|
|
Loading…
Reference in New Issue
Block a user