webui : handle PDF input (as text or image) + convert pasted long content to file (#13562)

* webui : handle PDF input (as text or image)

* handle the case where pdf image + server without mtmd

* fix bug missing pages
This commit is contained in:
Xuan-Son Nguyen
2025-05-15 14:24:50 +02:00
committed by GitHub
parent c753d7bed0
commit 3cc1f1f1d2
8 changed files with 425 additions and 22 deletions

View File

@ -16,6 +16,8 @@ export const CONFIG_DEFAULT = {
showTokensPerSecond: false,
showThoughtInProgress: false,
excludeThoughtOnReq: true,
pasteLongTextToFileLen: 2500,
pdfAsImage: false,
// make sure these default values are in sync with `common.h`
samplers: 'edkypmxt',
temperature: 0.8,
@ -43,6 +45,8 @@ export const CONFIG_DEFAULT = {
export const CONFIG_INFO: Record<string, string> = {
apiKey: 'Set the API Key if you are using --api-key option for the server.',
systemMessage: 'The starting message that defines how model should behave.',
pasteLongTextToFileLen:
'On pasting long text, it will be converted to a file. You can control the file length by setting the value of this parameter. Value 0 means disable.',
samplers:
'The order at which samplers are applied, in simplified way. Default is "dkypmxt": dry->top_k->typ_p->top_p->min_p->xtc->temperature',
temperature:

View File

@ -306,6 +306,7 @@ function ChatInput({
onStop: () => void;
isGenerating: boolean;
}) {
const { config } = useAppContext();
const [isDrag, setIsDrag] = useState(false);
return (
@ -328,7 +329,28 @@ function ChatInput({
{({ getRootProps, getInputProps }) => (
<div
className="flex flex-col rounded-xl border-1 border-base-content/30 p-3 w-full"
// when a file is pasted to the input, we handle it here
// if a text is pasted, and if it is long text, we will convert it to a file
onPasteCapture={(e: ClipboardEvent<HTMLInputElement>) => {
const text = e.clipboardData.getData('text/plain');
if (
text.length > 0 &&
config.pasteLongTextToFileLen > 0 &&
text.length > config.pasteLongTextToFileLen
) {
// if the text is too long, we will convert it to a file
extraContext.addItems([
{
type: 'context',
name: 'Pasted Content',
content: text,
},
]);
e.preventDefault();
return;
}
// if a file is pasted, we will handle it here
const files = Array.from(e.clipboardData.items)
.filter((item) => item.kind === 'file')
.map((item) => item.getAsFile())

View File

@ -100,6 +100,16 @@ const SETTING_SECTIONS: SettingSection[] = [
key,
}) as SettingFieldInput
),
{
type: SettingInputType.SHORT_INPUT,
label: 'Paste length to file',
key: 'pasteLongTextToFileLen',
},
{
type: SettingInputType.CHECKBOX,
label: 'Parse PDF as image instead of text',
key: 'pdfAsImage',
},
],
},
{
@ -452,10 +462,10 @@ function SettingsModalLongInput({
label?: string;
}) {
return (
<label className="form-control mb-2">
<div className="label inline">{label || configKey}</div>
<label className="form-control">
<div className="label inline text-sm">{label || configKey}</div>
<textarea
className="textarea textarea-bordered h-24"
className="textarea textarea-bordered h-24 mb-2"
placeholder={`Default: ${CONFIG_DEFAULT[configKey] || 'none'}`}
value={value}
onChange={(e) => onChange(e.target.value)}
@ -482,9 +492,7 @@ function SettingsModalShortInput({
<>
{/* on mobile, we simply show the help message here */}
{helpMsg && (
<div className="block md:hidden mb-1">
<b>{label || configKey}</b>
<br />
<div className="block mb-1 opacity-75">
<p className="text-xs">{helpMsg}</p>
</div>
)}
@ -493,11 +501,6 @@ function SettingsModalShortInput({
<div tabIndex={0} role="button" className="font-bold hidden md:block">
{label || configKey}
</div>
{helpMsg && (
<div className="dropdown-content menu bg-base-100 rounded-box z-10 w-64 p-2 shadow mt-4">
{helpMsg}
</div>
)}
</div>
<input
type="text"

View File

@ -2,6 +2,17 @@ import { useState } from 'react';
import { MessageExtra } from '../utils/types';
import toast from 'react-hot-toast';
import { useAppContext } from '../utils/app.context';
import * as pdfjs from 'pdfjs-dist';
import pdfjsWorkerSrc from 'pdfjs-dist/build/pdf.worker.min.mjs?url';
import { TextContent, TextItem } from 'pdfjs-dist/types/src/display/api';
pdfjs.GlobalWorkerOptions.workerSrc = pdfjsWorkerSrc;
// This file handles uploading extra context items (a.k.a files)
// It allows processing these kinds of files:
// - image files (converted to base64)
// - text files (including code files)
// - pdf (converted to text)
// Interface describing the API returned by the hook
export interface ChatExtraContextApi {
@ -13,7 +24,7 @@ export interface ChatExtraContextApi {
}
export function useChatExtraContext(): ChatExtraContextApi {
const { serverProps } = useAppContext();
const { serverProps, config } = useAppContext();
const [items, setItems] = useState<MessageExtra[]>([]);
const addItems = (newItems: MessageExtra[]) => {
@ -28,6 +39,8 @@ export function useChatExtraContext(): ChatExtraContextApi {
setItems([]);
};
const isSupportVision = serverProps?.modalities?.vision;
const onFileAdded = (files: File[]) => {
for (const file of files) {
const mimeType = file.type;
@ -38,7 +51,7 @@ export function useChatExtraContext(): ChatExtraContextApi {
}
if (mimeType.startsWith('image/')) {
if (!serverProps?.modalities?.vision) {
if (!isSupportVision) {
toast.error('Multimodal is not supported by this server or model.');
break;
}
@ -69,7 +82,43 @@ export function useChatExtraContext(): ChatExtraContextApi {
toast.error('Video and audio files are not supported yet.');
break;
} else if (mimeType.startsWith('application/pdf')) {
toast.error('PDF files are not supported yet.');
if (config.pdfAsImage && !isSupportVision) {
toast(
'Multimodal is not supported, PDF will be converted to text instead of image.'
);
break;
}
const promise =
config.pdfAsImage && isSupportVision
? convertPDFToImage(file).then((base64Urls) => {
addItems(
base64Urls.map((base64Url) => ({
type: 'imageFile',
name: file.name,
base64Url,
}))
);
})
: convertPDFToText(file).then((content) => {
if (isSupportVision) {
toast.success(
'PDF file converted to text. You can also convert it to image, see in Settings.'
);
}
addItems([
{
type: 'textFile',
name: file.name,
content,
},
]);
});
promise.catch((error) => {
console.error(error);
toast.error('Failed to parse PDF file.');
});
break;
} else {
// Because there can be many text file types (like code file), we will not check the mime type
@ -105,11 +154,69 @@ export function useChatExtraContext(): ChatExtraContextApi {
};
}
async function getFileAsBuffer(file: File): Promise<ArrayBuffer> {
return new Promise((resolve, reject) => {
const reader = new FileReader();
reader.onload = (event) => {
if (event.target?.result) {
resolve(event.target.result as ArrayBuffer);
} else {
reject(new Error('Failed to read file.'));
}
};
reader.readAsArrayBuffer(file);
});
}
async function convertPDFToText(file: File): Promise<string> {
const buffer = await getFileAsBuffer(file);
const pdf = await pdfjs.getDocument(buffer).promise;
const numPages = pdf.numPages;
const textContentPromises: Promise<TextContent>[] = [];
for (let i = 1; i <= numPages; i++) {
textContentPromises.push(
pdf.getPage(i).then((page) => page.getTextContent())
);
}
const textContents = await Promise.all(textContentPromises);
const textItems = textContents.flatMap((textContent: TextContent) =>
textContent.items.map((item) => (item as TextItem).str ?? '')
);
return textItems.join('\n');
}
// returns list of base64 images
async function convertPDFToImage(file: File): Promise<string[]> {
const buffer = await getFileAsBuffer(file);
const doc = await pdfjs.getDocument(buffer).promise;
const pages: Promise<string>[] = [];
for (let i = 1; i <= doc.numPages; i++) {
const page = await doc.getPage(i);
const viewport = page.getViewport({ scale: 1.5 });
const canvas = document.createElement('canvas');
const ctx = canvas.getContext('2d');
canvas.width = viewport.width;
canvas.height = viewport.height;
if (!ctx) {
throw new Error('Failed to get 2D context from canvas');
}
const task = page.render({ canvasContext: ctx, viewport: viewport });
pages.push(
task.promise.then(() => {
return canvas.toDataURL();
})
);
}
return await Promise.all(pages);
}
// WARN: vibe code below
// This code is a heuristic to determine if a string is likely not binary.
// It is necessary because input file can have various mime types which we don't have time to investigate.
// For example, a python file can be text/plain, application/x-python, etc.
export function isLikelyNotBinary(str: string): boolean {
function isLikelyNotBinary(str: string): boolean {
const options = {
prefixLength: 1024 * 10, // Check the first 10KB of the string
suspiciousCharThresholdRatio: 0.15, // Allow up to 15% suspicious chars