mirror of
https://github.com/Ladebeze66/ragflow_preprocess.git
synced 2026-02-04 10:10:29 +01:00
254 lines
7.4 KiB
Python
254 lines
7.4 KiB
Python
import multiprocessing
|
|
import os
|
|
import time
|
|
|
|
import pymupdf
|
|
|
|
|
|
# Support for concurrent processing of document pages.
|
|
#
|
|
|
|
class _worker_State:
|
|
pass
|
|
_worker_state = _worker_State()
|
|
|
|
|
|
def _worker_init(
|
|
path,
|
|
initfn,
|
|
initfn_args,
|
|
initfn_kwargs,
|
|
pagefn,
|
|
pagefn_args,
|
|
pagefn_kwargs,
|
|
stats,
|
|
):
|
|
# pylint: disable=attribute-defined-outside-init
|
|
_worker_state.path = path
|
|
_worker_state.pagefn = pagefn
|
|
_worker_state.pagefn_args = pagefn_args
|
|
_worker_state.pagefn_kwargs = pagefn_kwargs
|
|
_worker_state.stats = stats
|
|
_worker_state.document = None
|
|
if initfn:
|
|
initfn(*initfn_args, **initfn_kwargs)
|
|
|
|
|
|
def _stats_write(t, label):
|
|
t = time.time() - t
|
|
if t >= 10:
|
|
pymupdf.log(f'{os.getpid()=}: {t:2f}s: {label}.')
|
|
|
|
|
|
def _worker_fn(page_number):
|
|
# Create Document from filename if we haven't already done so.
|
|
if not _worker_state.document:
|
|
if _worker_state.stats:
|
|
t = time.time()
|
|
_worker_state.document = pymupdf.Document(_worker_state.path) # pylint: disable=attribute-defined-outside-init
|
|
if _worker_state.stats:
|
|
_stats_write(t, 'pymupdf.Document()')
|
|
|
|
if _worker_state.stats:
|
|
t = time.time()
|
|
page = _worker_state.document[page_number]
|
|
if _worker_state.stats:
|
|
_stats_write(t, '_worker_state.document[page_number]')
|
|
|
|
if _worker_state.stats:
|
|
t = time.time()
|
|
ret = _worker_state.pagefn(
|
|
page,
|
|
*_worker_state.pagefn_args,
|
|
**_worker_state.pagefn_kwargs,
|
|
)
|
|
if _worker_state.stats:
|
|
_stats_write(t, '_worker_state.pagefn()')
|
|
|
|
return ret
|
|
|
|
|
|
def _multiprocessing(
|
|
path,
|
|
pages,
|
|
pagefn,
|
|
pagefn_args,
|
|
pagefn_kwargs,
|
|
initfn,
|
|
initfn_args,
|
|
initfn_kwargs,
|
|
concurrency,
|
|
stats,
|
|
):
|
|
#print(f'_worker_mp(): {concurrency=}', flush=1)
|
|
with multiprocessing.Pool(
|
|
concurrency,
|
|
_worker_init,
|
|
(
|
|
path,
|
|
initfn, initfn_args, initfn_kwargs,
|
|
pagefn, pagefn_args, pagefn_kwargs,
|
|
stats,
|
|
),
|
|
) as pool:
|
|
result = pool.map_async(_worker_fn, pages)
|
|
return result.get()
|
|
|
|
|
|
def _fork(
|
|
path,
|
|
pages,
|
|
pagefn,
|
|
pagefn_args,
|
|
pagefn_kwargs,
|
|
initfn,
|
|
initfn_args,
|
|
initfn_kwargs,
|
|
concurrency,
|
|
stats,
|
|
):
|
|
verbose = 0
|
|
if concurrency is None:
|
|
concurrency = multiprocessing.cpu_count()
|
|
# We write page numbers to `queue_down` and read `(page_num, text)` from
|
|
# `queue_up`. Workers each repeatedly read the next available page number
|
|
# from `queue_down`, extract the text and write it onto `queue_up`.
|
|
#
|
|
# This is better than pre-allocating a subset of pages to each worker
|
|
# because it ensures there will never be idle workers until we are near the
|
|
# end with fewer pages left than workers.
|
|
#
|
|
queue_down = multiprocessing.Queue()
|
|
queue_up = multiprocessing.Queue()
|
|
def childfn():
|
|
document = None
|
|
if verbose:
|
|
pymupdf.log(f'{os.getpid()=}: {initfn=} {initfn_args=}')
|
|
_worker_init(
|
|
path,
|
|
initfn,
|
|
initfn_args,
|
|
initfn_kwargs,
|
|
pagefn,
|
|
pagefn_args,
|
|
pagefn_kwargs,
|
|
stats,
|
|
)
|
|
while 1:
|
|
if verbose:
|
|
pymupdf.log(f'{os.getpid()=}: calling get().')
|
|
page_num = queue_down.get()
|
|
if verbose:
|
|
pymupdf.log(f'{os.getpid()=}: {page_num=}.')
|
|
if page_num is None:
|
|
break
|
|
try:
|
|
if not document:
|
|
if stats:
|
|
t = time.time()
|
|
document = pymupdf.Document(path)
|
|
if stats:
|
|
_stats_write(t, 'pymupdf.Document(path)')
|
|
|
|
if stats:
|
|
t = time.time()
|
|
page = document[page_num]
|
|
if stats:
|
|
_stats_write(t, 'document[page_num]')
|
|
|
|
if verbose:
|
|
pymupdf.log(f'{os.getpid()=}: {_worker_state=}')
|
|
|
|
if stats:
|
|
t = time.time()
|
|
ret = pagefn(
|
|
page,
|
|
*_worker_state.pagefn_args,
|
|
**_worker_state.pagefn_kwargs,
|
|
)
|
|
if stats:
|
|
_stats_write(t, f'{page_num=} pagefn()')
|
|
except Exception as e:
|
|
if verbose: pymupdf.log(f'{os.getpid()=}: exception {e=}')
|
|
ret = e
|
|
if verbose:
|
|
pymupdf.log(f'{os.getpid()=}: sending {page_num=} {ret=}')
|
|
|
|
queue_up.put( (page_num, ret) )
|
|
|
|
error = None
|
|
|
|
pids = list()
|
|
try:
|
|
# Start child processes.
|
|
if stats:
|
|
t = time.time()
|
|
for i in range(concurrency):
|
|
p = os.fork() # pylint: disable=no-member
|
|
if p == 0:
|
|
# Child process.
|
|
try:
|
|
try:
|
|
childfn()
|
|
except Exception as e:
|
|
pymupdf.log(f'{os.getpid()=}: childfn() => {e=}')
|
|
raise
|
|
finally:
|
|
if verbose:
|
|
pymupdf.log(f'{os.getpid()=}: calling os._exit(0)')
|
|
os._exit(0)
|
|
pids.append(p)
|
|
if stats:
|
|
_stats_write(t, 'create child processes')
|
|
|
|
# Send page numbers.
|
|
if stats:
|
|
t = time.time()
|
|
if verbose:
|
|
pymupdf.log(f'Sending page numbers.')
|
|
for page_num in range(len(pages)):
|
|
queue_down.put(page_num)
|
|
if stats:
|
|
_stats_write(t, 'Send page numbers')
|
|
|
|
# Collect results. We give up if any worker sends an exception instead
|
|
# of text, but this hasn't been tested.
|
|
ret = [None] * len(pages)
|
|
for i in range(len(pages)):
|
|
page_num, text = queue_up.get()
|
|
if verbose:
|
|
pymupdf.log(f'{page_num=} {len(text)=}')
|
|
assert ret[page_num] is None
|
|
if isinstance(text, Exception):
|
|
if not error:
|
|
error = text
|
|
break
|
|
ret[page_num] = text
|
|
|
|
# Close queue. This should cause exception in workers and terminate
|
|
# them, but on macos-arm64 this does not seem to happen, so we also
|
|
# send None, which makes workers terminate.
|
|
for i in range(concurrency):
|
|
queue_down.put(None)
|
|
if verbose: pymupdf.log(f'Closing queues.')
|
|
queue_down.close()
|
|
|
|
if error:
|
|
raise error
|
|
if verbose:
|
|
pymupdf.log(f'After concurrent, returning {len(ret)=}')
|
|
return ret
|
|
|
|
finally:
|
|
# Join all child processes.
|
|
if stats:
|
|
t = time.time()
|
|
for pid in pids:
|
|
if verbose:
|
|
pymupdf.log(f'waiting for {pid=}.')
|
|
e = os.waitpid(pid, 0)
|
|
if verbose:
|
|
pymupdf.log(f'{pid=} => {e=}')
|
|
if stats:
|
|
_stats_write(t, 'Join all child proceses')
|